def test_origin_empty_snapshot_null_revision(client, archive_data, new_origin):
    """Browse an origin whose snapshot has a HEAD alias to a null branch.

    The directory view is expected to answer with HTTP 200, report the
    snapshot as empty and contain no ``swh-tr-link`` markup.
    """
    head_alias = SnapshotBranch(
        target="refs/head/master".encode(),
        target_type=TargetType.ALIAS,
    )
    snapshot = Snapshot(
        branches={
            b"HEAD": head_alias,
            b"refs/head/master": None,
        }
    )
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([snapshot])

    # Register one git visit and attach the snapshot to it as a partial status.
    pending_visit = OriginVisit(origin=new_origin.url, date=now(), type="git")
    visit = archive_data.origin_visit_add([pending_visit])[0]
    archive_data.origin_visit_status_add(
        [
            OriginVisitStatus(
                origin=new_origin.url,
                visit=visit.visit,
                date=now(),
                status="partial",
                snapshot=snapshot.id,
            )
        ]
    )

    browse_url = reverse(
        "browse-origin-directory",
        query_params={"origin_url": new_origin.url},
    )
    response = check_html_get_response(
        client, browse_url, status_code=200, template_used="browse/directory.html"
    )
    html = response.content.decode("utf-8")

    assert re.search("snapshot.*is empty", html)
    assert not re.search("swh-tr-link", html)
def test_visit_and_snapshot_get_from_revision(swh_storage, sample_data):
    """Look up visits and snapshots of an origin from a revision id.

    Two visits are stored; the first one gets a partial status pointing to
    the complete snapshot, the second one a full status pointing to a
    snapshot that is never added to the storage.
    """
    origin = sample_data.origin
    swh_storage.origin_add([origin])

    # Taken before the visits are stored; used as the date of the first status.
    first_status_date = now()

    visit1, visit2 = sample_data.origin_visits[:2]
    assert visit1.origin == origin.url
    stored_visit1, stored_visit2 = swh_storage.origin_visit_add([visit1, visit2])

    revision1, revision2, revision3 = sample_data.revisions[:3]
    swh_storage.revision_add([revision1, revision2])

    empty_snapshot, complete_snapshot = sample_data.snapshots[1:3]
    swh_storage.snapshot_add([complete_snapshot])

    # Add complete_snapshot to visit1 which targets revision1
    status1 = OriginVisitStatus(
        origin=stored_visit1.origin,
        visit=stored_visit1.visit,
        date=first_status_date,
        type=stored_visit1.type,
        status="partial",
        snapshot=complete_snapshot.id,
    )
    status2 = OriginVisitStatus(
        origin=stored_visit2.origin,
        visit=stored_visit2.visit,
        date=now(),
        type=stored_visit2.type,
        status="full",
        snapshot=empty_snapshot.id,
    )
    swh_storage.origin_visit_status_add([status1, status2])

    # Sanity-check the chronological ordering the scenario relies on.
    assert stored_visit1.date < stored_visit2.date
    assert stored_visit2.date < status1.date
    assert status1.date < status2.date

    # revision3 does not exist so result is None
    assert (
        snapshot_id_get_from_revision(swh_storage, origin.url, revision3.id) is None
    )

    # no snapshot targets revision2 for origin.url so result is None
    matches = list(
        visits_and_snapshots_get_from_revision(swh_storage, origin.url, revision2.id)
    )
    assert matches == []

    # complete_snapshot targets at least revision1
    matches = list(
        visits_and_snapshots_get_from_revision(swh_storage, origin.url, revision1.id)
    )
    assert matches == [(stored_visit1, status1, complete_snapshot)]
def test_iter_origin_visits(swh_storage, sample_data):
    """Iter over origin visits for an origin returns all visits"""
    origin1, origin2 = sample_data.origins[:2]
    swh_storage.origin_add([origin1, origin2])

    # Twenty daily git visits for origin1, starting twenty weeks ago.
    start_date = now() - datetime.timedelta(weeks=20)
    visits = swh_storage.origin_visit_add(
        [
            OriginVisit(
                origin=origin1.url,
                date=start_date + datetime.timedelta(days=day_offset),
                type="git",
            )
            for day_offset in range(20)
        ]
    )
    visits_desc = list(reversed(visits))

    # no limit, order asc
    assert list(iter_origin_visits(swh_storage, origin1.url)) == visits

    # no limit, order desc
    descending = list(
        iter_origin_visits(swh_storage, origin1.url, order=ListOrder.DESC)
    )
    assert descending == visits_desc

    # no result
    assert list(iter_origin_visits(swh_storage, origin2.url)) == []
def test_content_add_race(self, swh_storage, sample_data):
    """Add the same content metadata from two threads at once.

    Each thread reports a ``(thread id, kind, payload)`` tuple; both are
    expected to report ``"data"`` rather than an exception.
    """
    content = attr.evolve(sample_data.content, ctime=now())
    results = queue.Queue()

    def add_in_own_transaction():
        try:
            with db_transaction(swh_storage) as (db, cur):
                ret = swh_storage._content_add_metadata(db, cur, [content])
            results.put((threading.get_ident(), "data", ret))
        except Exception as exc:
            results.put((threading.get_ident(), "exc", exc))

    workers = [threading.Thread(target=add_in_own_transaction) for _ in range(2)]
    # Both threads are started back to back on purpose: sleeping between the
    # two starts (e.g. ``time.sleep(1)``) would avoid the race condition.
    for worker in workers:
        worker.start()
    for worker in workers:
        worker.join()

    first = results.get(block=False)
    second = results.get(block=False)

    # Exactly two outcomes must have been queued.
    with pytest.raises(queue.Empty):
        results.get(block=False)

    assert first[0] != second[0]
    assert first[1] == "data", "Got exception %r in Thread%s" % (first[2], first[0])
    assert second[1] == "data", "Got exception %r in Thread%s" % (
        second[2],
        second[0],
    )
def test_content_add_metadata_db(self, swh_storage, sample_data):
    """Add content metadata only and check the database row and the journal."""
    content = attr.evolve(sample_data.content, data=None, ctime=now())

    summary = swh_storage.content_add_metadata([content])
    assert summary == {"content:add": 1}

    # Only metadata was added: the object storage (when there is one) must
    # not hold the content.
    if hasattr(swh_storage, "objstorage"):
        assert content.sha1 not in swh_storage.objstorage.objstorage

    query = (
        "SELECT sha1, sha1_git, sha256, length, status"
        " FROM content WHERE sha1 = %s"
    )
    with db_transaction(swh_storage) as (_, cur):
        cur.execute(query, (content.sha1,))
        row = cur.fetchone()

    expected_row = (
        content.sha1,
        content.sha1_git,
        content.sha256,
        content.length,
        "visible",
    )
    assert row == expected_row

    journal_contents = []
    for obj_type, obj in swh_storage.journal_writer.journal.objects:
        if obj_type == "content":
            journal_contents.append(obj)
    assert len(journal_contents) == 1
    assert journal_contents[0] == content
def test_get_origin_visit_return_first_valid_partial_visit(
    archive_data, new_origin, new_snapshots
):
    """``get_origin_visit`` returns the most recent visit with a snapshot.

    One visit is created per snapshot, ten days apart.  A visit status
    (partial, with a snapshot) is only recorded for visits of index 3 and
    above; earlier visits get no status at all.
    """
    archive_data.origin_add([new_origin])

    visit_ids = []
    for index, snapshot in enumerate(new_snapshots):
        visit_date = now() + timedelta(days=index * 10)
        visit = archive_data.origin_visit_add(
            [OriginVisit(origin=new_origin.url, date=visit_date, type="git")]
        )[0]
        archive_data.snapshot_add([snapshot])

        with_snapshot = index > 2
        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=visit.visit,
            date=visit_date + timedelta(minutes=5),
            status="partial" if with_snapshot else "full",
            snapshot=snapshot.id if with_snapshot else None,
        )
        if with_snapshot:
            archive_data.origin_visit_status_add([visit_status])

        visit_ids.append(visit.visit)

    # should return the last visit
    expected_visit = archive_data.origin_visit_get_by(new_origin.url, visit_ids[-1])
    assert get_origin_visit(OriginInfo(url=new_origin.url)) == expected_visit
def test_get_origin_visit_non_resolvable_snapshots(
    archive_data, new_origin, new_snapshots
):
    """``get_origin_visit`` falls back to the last visit with a stored status.

    One visit is created per snapshot, ten days apart.  A full visit status
    is only recorded for the first three visits; later visits get none.
    """
    archive_data.origin_add([new_origin])

    visit_ids = []
    for index, snapshot in enumerate(new_snapshots):
        visit_date = now() + timedelta(days=index * 10)
        visit = archive_data.origin_visit_add(
            [OriginVisit(origin=new_origin.url, date=visit_date, type="git")]
        )[0]
        archive_data.snapshot_add([snapshot])

        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=visit.visit,
            date=visit_date + timedelta(minutes=5),
            status="full",
            snapshot=snapshot.id,
        )
        if index < 3:
            archive_data.origin_visit_status_add([visit_status])

        visit_ids.append(visit.visit)

    # should return the third visit
    expected_visit = archive_data.origin_visit_get_by(new_origin.url, visit_ids[2])
    assert get_origin_visit(OriginInfo(url=new_origin.url)) == expected_visit
def test_origin_snapshot_invalid_branch(
    client, archive_data, new_origin, new_snapshot, visit_dates, revisions
):
    """Requesting a branch absent from the snapshot yields a 404 error page."""
    snp_dict = new_snapshot.to_dict()
    archive_data.origin_add([new_origin])

    # Retarget every branch of the snapshot to one of the known revisions.
    branches = snp_dict["branches"]
    for position, branch_name in enumerate(branches):
        branches[branch_name] = {
            "target_type": "revision",
            "target": hash_to_bytes(revisions[position]),
        }
    archive_data.snapshot_add([Snapshot.from_dict(snp_dict)])

    pending_visit = OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git")
    visit = archive_data.origin_visit_add([pending_visit])[0]
    archive_data.origin_visit_status_add(
        [
            OriginVisitStatus(
                origin=new_origin.url,
                visit=visit.visit,
                date=now(),
                status="full",
                snapshot=snp_dict["id"],
            )
        ]
    )

    query_params = {"origin_url": new_origin.url, "branch": "invalid_branch"}
    url = reverse("browse-origin-directory", query_params=query_params)
    check_html_get_response(client, url, status_code=404, template_used="error.html")
def _add_origin(storage, search, origin_url, visit_type="git", snapshot_branches=None):
    """Register an origin with one full visit in both storage and search.

    Args:
        storage: storage backend receiving the origin, visit, snapshot and
            visit status.
        search: search backend receiving the origin document.
        origin_url: URL of the origin to create.
        visit_type: type used for the visit and advertised to search.
        snapshot_branches: branches (in ``Snapshot.from_dict`` format) of the
            snapshot attached to the visit; defaults to no branch at all.
    """
    # Avoid a mutable default argument: a fresh dict is built for each call.
    if snapshot_branches is None:
        snapshot_branches = {}

    storage.origin_add([Origin(url=origin_url)])
    search.origin_update(
        [{"url": origin_url, "has_visits": True, "visit_types": [visit_type]}]
    )

    date = now()
    visit = OriginVisit(origin=origin_url, date=date, type=visit_type)
    visit = storage.origin_visit_add([visit])[0]

    snapshot = Snapshot.from_dict({"branches": snapshot_branches})
    storage.snapshot_add([snapshot])

    # The status is dated one minute after the visit itself.
    visit_status = OriginVisitStatus(
        origin=origin_url,
        visit=visit.visit,
        date=date + timedelta(minutes=1),
        type=visit.type,
        status="full",
        snapshot=snapshot.id,
    )
    storage.origin_visit_status_add([visit_status])
def test_iter_origin_visit_status(swh_storage, sample_data):
    """Iterating over the statuses of a visit returns all of them, in order."""
    origin1, origin2 = sample_data.origins[:2]
    swh_storage.origin_add([origin1])

    ov1 = swh_storage.origin_visit_add([sample_data.origin_visit])[0]
    assert ov1.origin == origin1.url

    start_date = now() - datetime.timedelta(weeks=20)

    def created_status(date):
        """Build a "created" status without snapshot for ``ov1`` at ``date``."""
        return OriginVisitStatus(
            origin=ov1.origin,
            visit=ov1.visit,
            date=date,
            type=ov1.type,
            status="created",
            snapshot=None,
        )

    # First the status dated like the visit itself, then twenty daily ones.
    statuses = [created_status(ov1.date)]
    statuses.extend(
        created_status(start_date + datetime.timedelta(days=day_offset))
        for day_offset in range(20)
    )
    swh_storage.origin_visit_status_add(statuses)
    statuses_desc = list(reversed(statuses))

    # order asc
    ascending = list(iter_origin_visit_statuses(swh_storage, ov1.origin, ov1.visit))
    assert ascending == statuses

    # order desc
    descending = list(
        iter_origin_visit_statuses(
            swh_storage, ov1.origin, ov1.visit, order=ListOrder.DESC
        )
    )
    assert descending == statuses_desc

    # no result
    nothing = list(iter_origin_visit_statuses(swh_storage, origin2.url, ov1.visit))
    assert nothing == []
def test_api_lookup_origin_visits(
    api_client, archive_data, new_origin, visit_dates, new_snapshots
):
    """Check pagination of the ``api-1-origin-visits`` endpoint.

    One full visit is stored per visit date, then the endpoint is queried
    twice with ``per_page=2``: once without ``last_visit`` and once with
    ``last_visit`` set to the second visit of the first page.
    """
    archive_data.origin_add([new_origin])
    for i, visit_date in enumerate(visit_dates):
        snapshot = new_snapshots[i]
        origin_visit = archive_data.origin_visit_add(
            [OriginVisit(origin=new_origin.url, date=visit_date, type="git")]
        )[0]
        archive_data.snapshot_add([snapshot])
        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=origin_visit.visit,
            date=now(),
            status="full",
            snapshot=snapshot.id,
        )
        archive_data.origin_visit_status_add([visit_status])

    all_visits = list(reversed(get_origin_visits(new_origin.to_dict())))

    for last_visit, expected_visits in (
        (None, all_visits[:2]),
        (all_visits[1]["visit"], all_visits[2:]),
    ):
        url = reverse(
            "api-1-origin-visits",
            url_args={"origin_url": new_origin.url},
            query_params={"per_page": 2, "last_visit": last_visit},
        )
        rv = check_api_get_responses(api_client, url, status_code=200)

        # Build the enriched list directly instead of rewriting the slice in
        # place through an index that shadowed the outer loop variable.
        enriched_visits = [
            enrich_origin_visit(
                expected_visit,
                with_origin_link=False,
                with_origin_visit_link=True,
                request=rv.wsgi_request,
            )
            for expected_visit in expected_visits
        ]

        assert rv.data == enriched_visits
def fill_storage(storage):
    """Populate ``storage`` with the module-level sample objects.

    Adds origins, directories, revisions and snapshots, then one full visit
    per ``(ORIGIN_VISITS, SNAPSHOTS)`` pair, and finally one content per
    entry of ``OBJ_STORAGE_DATA``.
    """
    storage.origin_add(ORIGINS)
    storage.directory_add([DIRECTORY, DIRECTORY2])
    storage.revision_add(REVISIONS)
    storage.snapshot_add(SNAPSHOTS)

    for visit_spec, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS):
        assert snapshot.id is not None

        new_visit = OriginVisit(
            origin=visit_spec["origin"], date=now(), type=visit_spec["type"]
        )
        stored_visit = storage.origin_visit_add([new_visit])[0]
        storage.origin_visit_status_add(
            [
                OriginVisitStatus(
                    origin=stored_visit.origin,
                    visit=stored_visit.visit,
                    date=now(),
                    status="full",
                    snapshot=snapshot.id,
                )
            ]
        )

    def build_content(obj_id, raw):
        """Wrap raw bytes into a visible Content keyed by ``obj_id``."""
        digests = hashutil.MultiHash.from_data(raw).digest()
        # The object id is used for both sha1 and sha1_git.
        return Content(
            data=raw,
            length=len(raw),
            status="visible",
            sha1=hash_to_bytes(obj_id),
            sha1_git=hash_to_bytes(obj_id),
            sha256=digests["sha256"],
            blake2s256=digests["blake2s256"],
        )

    storage.content_add(
        [build_content(obj_id, raw) for obj_id, raw in OBJ_STORAGE_DATA.items()]
    )
def test_origin_browse_directory_branch_with_non_resolvable_revision(
    client, archive_data, new_origin, unknown_revision
):
    """Browse a branch whose target revision is unknown.

    The directory page must still render (HTTP 200) and state that the
    revision could not be found in the archive.
    """
    branch_name = "master"
    unresolved_branch = SnapshotBranch(
        target=hash_to_bytes(unknown_revision),
        target_type=TargetType.REVISION,
    )
    snapshot = Snapshot(branches={branch_name.encode(): unresolved_branch})

    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([snapshot])

    pending_visit = OriginVisit(origin=new_origin.url, date=now(), type="git")
    visit = archive_data.origin_visit_add([pending_visit])[0]
    archive_data.origin_visit_status_add(
        [
            OriginVisitStatus(
                origin=new_origin.url,
                visit=visit.visit,
                date=now(),
                status="partial",
                snapshot=snapshot.id,
            )
        ]
    )

    query_params = {"origin_url": new_origin.url, "branch": branch_name}
    url = reverse("browse-origin-directory", query_params=query_params)
    response = check_html_get_response(
        client, url, status_code=200, template_used="browse/directory.html"
    )

    expected_message = (
        f"Revision {unknown_revision} could not be found in the archive."
    )
    assert_contains(response, expected_message)
def test_content_get_partition_murmur3_collision(self, swh_storage, mocker, sample_data):
    """The Murmur3 token is used as link from index tables to the main table;
    and non-matching contents with colliding murmur3-hash are filtered-out
    when reading the main table.

    This test checks the content_get_partition endpoints return all
    contents, even the collisions.
    """
    called = 0

    # One row per sample content, keyed by an artificial token.
    rows: Dict[int, Dict] = {
        tok: {**attr.evolve(content, data=None, ctime=now()).to_dict(), "tok": tok}
        for tok, content in enumerate(sample_data.contents)
    }

    # For all tokens, always return cont
    def mock_content_get_token_range(range_start, range_end, limit):
        nonlocal called
        called += 1

        # yield multiple times the same tok
        for _ in range(3):
            for tok in list(rows):
                fields = {key: value for key, value in rows[tok].items() if key != "tok"}
                yield (tok, ContentRow(**fields))

    mocker.patch.object(
        swh_storage._cql_runner,
        "content_get_token_range",
        mock_content_get_token_range,
    )

    listed = list(
        stream_results(
            swh_storage.content_get_partition, partition_id=0, nb_partitions=1
        )
    )

    assert called > 0

    # everything is listed, even collisions
    assert len(listed) == 3 * len(sample_data.contents)
    # as we duplicated the returned results, dropping duplicate should yield
    # the original length
    assert len(set(listed)) == len(sample_data.contents)
def test_content_find_murmur3_collision(self, swh_storage, mocker, sample_data):
    """The Murmur3 token is used as link from index tables to the main table;
    and non-matching contents with colliding murmur3-hash are filtered-out
    when reading the main table.

    This test checks the content methods do filter out these collisions.
    """
    called = 0

    first, second = sample_data.contents[:2]
    cont = attr.evolve(first, ctime=now())
    cont2 = attr.evolve(second, ctime=now())

    # always return a token
    def mock_cgtfsa(algo, hashes):
        nonlocal called
        called += 1
        assert algo in ("sha1", "sha1_git")
        return [123456]

    mocker.patch.object(
        swh_storage._cql_runner,
        "content_get_tokens_from_single_algo",
        mock_cgtfsa,
    )

    # For all tokens, always return cont and cont2
    cols = list(set(cont.to_dict()) - {"data"})

    def mock_cgft(tokens):
        nonlocal called
        called += 1
        colliding_rows = []
        for candidate in (cont, cont2):
            values = {col: getattr(candidate, col) for col in cols}
            colliding_rows.append(ContentRow(**values))
        return colliding_rows

    mocker.patch.object(
        swh_storage._cql_runner, "content_get_from_tokens", mock_cgft
    )

    expected_content = attr.evolve(cont, data=None)

    found = swh_storage.content_find({"sha1": cont.sha1})

    # One call to each of the two mocked lookups.
    assert called == 2

    # but cont2 should be filtered out
    assert found == [expected_content]
def test_api_lookup_origin_visit_latest_with_snapshot(
    api_client, archive_data, new_origin, visit_dates, new_snapshots
):
    """``api-1-origin-visit-latest`` honours ``require_snapshot``.

    Visits are stored in chronological order and only the last one gets a
    full status with a snapshot; the endpoint must return that status,
    enriched with the origin link.
    """
    archive_data.origin_add([new_origin])

    # Iterate over a sorted copy so the ``visit_dates`` fixture is left
    # untouched; the loop index is not needed either.
    visit_ids = []
    for visit_date in sorted(visit_dates):
        origin_visit = archive_data.origin_visit_add(
            [OriginVisit(origin=new_origin.url, date=visit_date, type="git")]
        )[0]
        visit_ids.append(origin_visit.visit)

    snapshot = new_snapshots[0]
    archive_data.snapshot_add([snapshot])

    # Add snapshot to the latest visit
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit_ids[-1],
        date=now(),
        status="full",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "api-1-origin-visit-latest",
        url_args={"origin_url": new_origin.url},
        query_params={"require_snapshot": True},
    )

    rv = check_api_get_responses(api_client, url, status_code=200)

    expected_visit = archive_data.origin_visit_status_get_latest(
        new_origin.url, type="git", require_snapshot=True
    )
    expected_visit = enrich_origin_visit(
        expected_visit,
        with_origin_link=True,
        with_origin_visit_link=False,
        request=rv.wsgi_request,
    )

    assert rv.data == expected_visit
def test_retrying_proxy_storage_content_add_metadata(swh_storage, sample_data):
    """Standard content_add_metadata works as before"""
    metadata_only = attr.evolve(sample_data.content, data=None)
    sha1 = metadata_only.sha1

    # Nothing is known about this content before the insertion.
    assert swh_storage.content_get([sha1]) == [None]

    summary = swh_storage.content_add_metadata(
        [attr.evolve(metadata_only, ctime=now())]
    )
    assert summary == {"content:add": 1}

    stored = swh_storage.content_get([sha1])
    assert len(stored) == 1
    assert stored[0].sha1 == sha1
def test_origin_branches_pagination_with_alias(
    client, archive_data, mocker, new_origin, visit_dates, revisions, existing_release
):
    """
    When a snapshot contains a branch or a release alias, pagination links
    in the branches / releases view should be displayed.
    """
    # NOTE(review): true division makes the patched PER_PAGE a float; kept
    # as-is, confirm whether an integer (``//``) was intended.
    mocker.patch("swh.web.browse.snapshot_context.PER_PAGE", len(revisions) / 2)

    snp_dict = {"branches": {}, "id": hash_to_bytes(random_sha1())}

    # One randomly named branch per revision (iterating the revisions
    # directly rather than through ``range(len(...))``).
    for revision in revisions:
        branch = "".join(random.choices(string.ascii_lowercase, k=8))
        snp_dict["branches"][branch.encode()] = {
            "target_type": "revision",
            "target": hash_to_bytes(revision),
        }

    # A release branch with a random name, plus an alias pointing to it.
    release = "".join(random.choices(string.ascii_lowercase, k=8))
    snp_dict["branches"][b"RELEASE_ALIAS"] = {
        "target_type": "alias",
        "target": release.encode(),
    }
    snp_dict["branches"][release.encode()] = {
        "target_type": "release",
        "target": hash_to_bytes(existing_release),
    }

    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([Snapshot.from_dict(snp_dict)])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git")]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="full",
        snapshot=snp_dict["id"],
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse("browse-origin-branches", query_params={"origin_url": new_origin.url})
    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/branches.html"
    )
    assert_contains(resp, '<ul class="pagination')
def test_pypi_missing_branch(self):
    """Run the indexer on a PyPI origin and expect no result.

    The origin gets one full "pypi" visit pointing to ``SAMPLE_SNAPSHOT``.
    """
    origin_url = "https://pypi.org/project/abcdef/"
    self.indexer.storage.origin_add([Origin(url=origin_url)])
    visit = self.indexer.storage.origin_visit_add(
        [
            OriginVisit(
                origin=origin_url,
                date=datetime(2019, 2, 27, tzinfo=timezone.utc),
                type="pypi",
            )
        ]
    )[0]
    self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
    visit_status = OriginVisitStatus(
        origin=origin_url,
        visit=visit.visit,
        date=now(),
        status="full",
        snapshot=SAMPLE_SNAPSHOT.id,
    )
    self.indexer.storage.origin_visit_status_add([visit_status])

    # Reuse ``origin_url`` rather than repeating the literal, so the URL that
    # is indexed cannot drift from the one that was stored.
    self.indexer.run([origin_url])
    self.assertEqual(self.indexer.results, [])
def test_origin_get_latest_visit_status_filter_snapshot(swh_storage, sample_data):
    """``require_snapshot=True`` only returns statuses holding a snapshot.

    Adding a later visit without any snapshot must not change the answer.
    """
    objects = init_storage_with_origin_visits(swh_storage, sample_data)
    origin1, origin2 = objects["origin"]
    _, ov2 = objects["origin_visit"]
    _, _, _, ovs22 = objects["origin_visit_status"]

    # there is no visit with snapshot yet for that visit
    assert (
        origin_get_latest_visit_status(
            swh_storage, origin1.url, require_snapshot=True
        )
        is None
    )

    # the only visit status of origin2 holding a snapshot is elected
    actual_ovs22 = origin_get_latest_visit_status(
        swh_storage, origin2.url, require_snapshot=True
    )
    assert actual_ovs22 == ovs22
    assert actual_ovs22.origin == ov2.origin
    assert actual_ovs22.visit == ov2.visit
    assert actual_ovs22.type == ov2.type

    date_now = now()

    # Add another visit
    swh_storage.origin_visit_add(
        [
            OriginVisit(
                origin=origin2.url,
                date=date_now,
                type=sample_data.type_visit2,
            ),
        ]
    )

    # Requiring the latest visit with a snapshot, we still find the previous
    # visit.  The fresh result gets its own name so that the expected status
    # is not overwritten and the checks below apply to the new lookup.
    latest_ovs = origin_get_latest_visit_status(
        swh_storage, origin2.url, require_snapshot=True
    )
    assert latest_ovs == ovs22
    assert latest_ovs.origin == ov2.origin
    assert latest_ovs.visit == ov2.visit
    assert latest_ovs.type == ov2.type
def test_git_partial_snapshot(self):
    """Checks partial snapshots are ignored."""
    origin_url = "https://github.com/SoftwareHeritage/swh-core"
    visit_date = datetime(2019, 2, 27, tzinfo=timezone.utc)

    self.indexer.storage.origin_add([Origin(url=origin_url)])
    new_visit = OriginVisit(origin=origin_url, date=visit_date, type="git")
    visit = self.indexer.storage.origin_visit_add([new_visit])[0]
    self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])

    # The snapshot is only attached through a partial visit status.
    self.indexer.storage.origin_visit_status_add(
        [
            OriginVisitStatus(
                origin=origin_url,
                visit=visit.visit,
                date=now(),
                status="partial",
                snapshot=SAMPLE_SNAPSHOT.id,
            )
        ]
    )

    self.indexer.run([origin_url])
    self.assertEqual(self.indexer.results, [])
def init_storage_with_origin_visits(swh_storage, sample_data):
    """Initialize storage with origin/origin-visit/origin-visit-status"""
    snapshot = sample_data.snapshots[2]
    origin1, origin2 = sample_data.origins[:2]
    swh_storage.origin_add([origin1, origin2])

    visit_specs = [
        (origin1, sample_data.date_visit1, sample_data.type_visit1),
        (origin2, sample_data.date_visit2, sample_data.type_visit2),
    ]
    ov1, ov2 = swh_storage.origin_visit_add(
        [
            OriginVisit(origin=origin.url, date=date, type=visit_type)
            for origin, date, visit_type in visit_specs
        ]
    )

    swh_storage.snapshot_add([snapshot])

    date_now = round_to_milliseconds(now())
    assert sample_data.date_visit1 < sample_data.date_visit2
    assert sample_data.date_visit2 < date_now

    def status_of(visit, date, status, **extra):
        """Build a status for ``visit``, copying its origin, id and type."""
        return OriginVisitStatus(
            origin=visit.origin,
            visit=visit.visit,
            date=date,
            type=visit.type,
            status=status,
            **extra,
        )

    ten_seconds = datetime.timedelta(seconds=10)

    # origin visit status 1 for origin visit 1 (dated after the visit so
    # it's not ignored)
    ovs11 = status_of(ov1, ov1.date + ten_seconds, "partial", snapshot=None)
    # origin visit status 2 for origin visit 1
    ovs12 = status_of(ov1, sample_data.date_visit2, "ongoing", snapshot=None)
    # origin visit status 1 for origin visit 2 (dated after the visit so
    # it's not ignored)
    ovs21 = status_of(ov2, ov2.date + ten_seconds, "ongoing", snapshot=None)
    # origin visit status 2 for origin visit 2
    ovs22 = status_of(
        ov2,
        date_now,
        "full",
        snapshot=snapshot.id,
        metadata={"something": "wicked"},
    )

    statuses = [ovs11, ovs12, ovs21, ovs22]
    swh_storage.origin_visit_status_add(statuses)

    return {
        "origin": [origin1, origin2],
        "origin_visit": [ov1, ov2],
        "origin_visit_status": statuses,
    }
return d testdata = [ pytest.param( "content", "content_add", list(TEST_OBJECTS["content"]), attr.evolve(model.Content.from_data(data=b"too big"), length=1000), attr.evolve(model.Content.from_data(data=b"to fail"), length=1000), id="content", ), pytest.param( "content", "content_add_metadata", [attr.evolve(cnt, ctime=now()) for cnt in TEST_OBJECTS["content"]], attr.evolve(model.Content.from_data(data=b"too big"), length=1000, ctime=now()), attr.evolve(model.Content.from_data(data=b"to fail"), length=1000, ctime=now()), id="content_metadata", ), pytest.param( "skipped_content", "skipped_content_add", list(TEST_OBJECTS["skipped_content"]), attr.evolve( model.SkippedContent.from_data(data=b"too big", reason="too big"), length=1000, ), attr.evolve( model.SkippedContent.from_data(data=b"to fail", reason="to fail"), length=1000,
def test_snapshot_get_latest(swh_storage, sample_data):
    """Exercise ``snapshot_get_latest`` with and without a stored snapshot."""
    origin = sample_data.origin
    swh_storage.origin_add([origin])

    visit1, visit2 = sample_data.origin_visits[:2]
    assert visit1.origin == origin.url

    swh_storage.origin_visit_add([visit1])
    ov1 = swh_storage.origin_visit_get_latest(origin.url)

    # Add snapshot to visit1, latest snapshot = visit 1 snapshot
    complete_snapshot = sample_data.snapshots[2]
    swh_storage.snapshot_add([complete_snapshot])

    partial_without_snapshot = OriginVisitStatus(
        origin=origin.url,
        visit=ov1.visit,
        date=visit2.date,
        status="partial",
        snapshot=None,
    )
    swh_storage.origin_visit_status_add([partial_without_snapshot])
    assert visit1.date < visit2.date

    # no snapshot associated to the visit, so None
    latest = snapshot_get_latest(
        swh_storage, origin.url, allowed_statuses=["partial"]
    )
    assert latest is None

    date_now = now()
    assert visit2.date < date_now
    full_with_snapshot = OriginVisitStatus(
        origin=ov1.origin,
        visit=ov1.visit,
        date=date_now,
        type=ov1.type,
        status="full",
        snapshot=complete_snapshot.id,
    )
    swh_storage.origin_visit_status_add([full_with_snapshot])

    # A further visit is stored without any status attached to it here.
    swh_storage.origin_visit_add(
        [OriginVisit(origin=origin.url, date=now(), type=visit1.type)]
    )

    latest = snapshot_get_latest(swh_storage, origin.url)
    assert latest is not None
    assert latest == complete_snapshot

    # Limiting the branch count keeps the id but truncates the branches.
    latest = snapshot_get_latest(swh_storage, origin.url, branches_count=1)
    assert latest is not None
    assert latest.id == complete_snapshot.id
    assert len(latest.branches.values()) == 1

    with pytest.raises(ValueError, match="branches_count must be a positive integer"):
        snapshot_get_latest(
            swh_storage, origin.url, branches_count="something-wrong"
        )
def load(self) -> Dict[str, str]:
    r"""Loading logic for the loader to follow:

    - Call :meth:`pre_cleanup` (best effort: failures are logged and
      reported to sentry, then loading continues)
    - Store the actual ``origin_visit`` to storage
    - Build and load the extrinsic origin metadata (best effort as well)
    - Call :meth:`prepare` to prepare any eventual state

    - while True:

      - Call :meth:`fetch_data` to fetch the data to store
      - Call :meth:`process_data` to optionally run processing between
        :meth:`fetch_data` and :meth:`store_data`
      - Call :meth:`store_data` to store the data

    - Record the final visit status and call :meth:`post_load`
    - Call :meth:`flush` then :meth:`cleanup` to clean up any eventual
      state put in place in :meth:`prepare` method.

    Returns:
        The result of :meth:`load_status` when loading went through;
        otherwise ``{"status": "uneventful"}`` for a ``NotFound`` error and
        ``{"status": "failed"}`` for any other ``Exception``.

    Raises:
        BaseException: exceptions that do not derive from ``Exception``
            (most likely ``SystemExit`` or ``KeyboardInterrupt``) are
            re-raised once the visit status has been recorded.

    """
    try:
        with self.statsd_timed("pre_cleanup"):
            self.pre_cleanup()
    except Exception:
        msg = "Cleaning up dangling data failed! Continue loading."
        self.log.warning(msg)
        sentry_sdk.capture_exception()

    self._store_origin_visit()

    assert (
        self.visit.visit
    ), "The method `_store_origin_visit` should set the visit (OriginVisit)"
    self.log.info("Load origin '%s' with type '%s'", self.origin.url, self.visit.type)

    try:
        with self.statsd_timed("build_extrinsic_origin_metadata"):
            metadata = self.build_extrinsic_origin_metadata()
        self.load_metadata_objects(metadata)
    except Exception as e:
        sentry_sdk.capture_exception(e)
        # Do not fail the whole task if this is the only failure
        self.log.exception(
            "Failure while loading extrinsic origin metadata.",
            extra={
                "swh_task_args": [],
                "swh_task_kwargs": {
                    "origin": self.origin.url,
                    "lister_name": self.lister_name,
                    "lister_instance_name": self.lister_instance_name,
                },
            },
        )

    # Cumulated durations (in seconds) of each phase over all loop iterations.
    total_time_fetch_data = 0.0
    total_time_process_data = 0.0
    total_time_store_data = 0.0

    try:
        # Initially not a success, will be True when actually one
        success = False
        with self.statsd_timed("prepare"):
            self.prepare()

        while True:
            t1 = time.monotonic()
            more_data_to_fetch = self.fetch_data()
            t2 = time.monotonic()
            total_time_fetch_data += t2 - t1

            # process_data() is always called; the loop only goes on when
            # both it and fetch_data() returned a truthy value.
            more_data_to_fetch = self.process_data() and more_data_to_fetch
            t3 = time.monotonic()
            total_time_process_data += t3 - t2

            self.store_data()
            t4 = time.monotonic()
            total_time_store_data += t4 - t3
            if not more_data_to_fetch:
                break

        # Durations are multiplied by 1000 (seconds to milliseconds).
        self.statsd_timing("fetch_data", total_time_fetch_data * 1000.0)
        self.statsd_timing("process_data", total_time_process_data * 1000.0)
        self.statsd_timing("store_data", total_time_store_data * 1000.0)

        status = self.visit_status()
        visit_status = OriginVisitStatus(
            origin=self.origin.url,
            visit=self.visit.visit,
            type=self.visit_type,
            date=now(),
            status=status,
            snapshot=self.loaded_snapshot_id,
        )
        self.storage.origin_visit_status_add([visit_status])
        success = True
        with self.statsd_timed("post_load", tags={
            "success": success,
            "status": status
        }):
            self.post_load()
    except BaseException as e:
        # BaseException is caught so that a visit status is recorded even on
        # SystemExit / KeyboardInterrupt (which are re-raised further down).
        success = False
        if isinstance(e, NotFound):
            status = "not_found"
            task_status = "uneventful"
        else:
            # A snapshot id set before the failure means a partial visit.
            status = "partial" if self.loaded_snapshot_id else "failed"
            task_status = "failed"

        self.log.exception(
            "Loading failure, updating to `%s` status",
            status,
            extra={
                "swh_task_args": [],
                "swh_task_kwargs": {
                    "origin": self.origin.url,
                    "lister_name": self.lister_name,
                    "lister_instance_name": self.lister_instance_name,
                },
            },
        )
        if not isinstance(e, (SystemExit, KeyboardInterrupt)):
            sentry_sdk.capture_exception()
        visit_status = OriginVisitStatus(
            origin=self.origin.url,
            visit=self.visit.visit,
            type=self.visit_type,
            date=now(),
            status=status,
            snapshot=self.loaded_snapshot_id,
        )
        self.storage.origin_visit_status_add([visit_status])
        with self.statsd_timed("post_load", tags={
            "success": success,
            "status": status
        }):
            self.post_load(success=success)
        if not isinstance(e, Exception):
            # e derives from BaseException but not Exception; this is most likely
            # SystemExit or KeyboardInterrupt, so we should re-raise it.
            raise
        return {"status": task_status}
    finally:
        # Runs on success, on the failure return above and on re-raise alike;
        # ``success`` and ``status`` are set by whichever branch ran before.
        with self.statsd_timed("flush", tags={
            "success": success,
            "status": status
        }):
            self.flush()
        with self.statsd_timed("cleanup", tags={
            "success": success,
            "status": status
        }):
            self.cleanup()

    return self.load_status()
def test_sub_directory_view_origin_context(
    client, archive_data, empty_directory, person, date
):
    """Check the view of a sub-directory browsed in an origin context.

    Builds a ``parentdir/baz/{foo,bar}`` tree, a revision on top of it, a
    snapshot whose HEAD aliases a branch targeting that revision, and one
    full visit; the first entry of the parent directory is then checked.
    """
    origin_url = "test_sub_directory_view_origin_context"

    subdir = Directory(
        entries=tuple(
            DirectoryEntry(
                name=entry_name,
                type="dir",
                target=hash_to_bytes(empty_directory),
                perms=DentryPerms.directory,
            )
            for entry_name in (b"foo", b"bar")
        )
    )
    baz_entry = DirectoryEntry(
        name=b"baz",
        type="dir",
        target=subdir.id,
        perms=DentryPerms.directory,
    )
    parentdir = Directory(entries=(baz_entry,))
    archive_data.directory_add([subdir, parentdir])

    revision = Revision(
        directory=parentdir.id,
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
    )
    archive_data.revision_add([revision])

    head_alias = SnapshotBranch(
        target="refs/head/master".encode(),
        target_type=TargetType.ALIAS,
    )
    master_branch = SnapshotBranch(
        target=revision.id,
        target_type=TargetType.REVISION,
    )
    snapshot = Snapshot(
        branches={b"HEAD": head_alias, b"refs/head/master": master_branch}
    )
    archive_data.snapshot_add([snapshot])

    archive_data.origin_add([Origin(url=origin_url)])

    # The visit and its status share the same date.
    visit_date = now()
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=origin_url, date=visit_date, type="git")]
    )[0]
    archive_data.origin_visit_status_add(
        [
            OriginVisitStatus(
                origin=origin_url,
                visit=visit.visit,
                date=visit_date,
                status="full",
                snapshot=snapshot.id,
            )
        ]
    )

    parent_hex = hash_to_hex(parentdir.id)
    first_entry = archive_data.directory_ls(parent_hex)[0]
    first_entry_content = archive_data.directory_ls(first_entry["target"])

    _directory_view_checks(
        client,
        hash_to_hex(parentdir.id),
        first_entry_content,
        first_entry["name"],
        origin_url,
        hash_to_hex(snapshot.id),
        hash_to_hex(revision.id),
    )