コード例 #1
0
ファイル: test_origin.py プロジェクト: shivam2003sy/swh-web
def test_origin_empty_snapshot_null_revision(client, archive_data, new_origin):
    """Browsing an origin whose only snapshot is an alias to a dangling
    (None) branch renders the directory view with an "empty snapshot"
    notice and without the swh-tr-link element.
    """
    # HEAD is an alias to refs/head/master, which itself resolves to None.
    snapshot = Snapshot(
        branches={
            b"HEAD": SnapshotBranch(
                target="refs/head/master".encode(), target_type=TargetType.ALIAS,
            ),
            b"refs/head/master": None,
        }
    )
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([snapshot])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=now(), type="git",)]
    )[0]
    # Attach the snapshot to the visit with a "partial" status.
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="partial",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "browse-origin-directory", query_params={"origin_url": new_origin.url},
    )

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/directory.html"
    )
    resp_content = resp.content.decode("utf-8")
    # The rendered page must flag the snapshot as empty and must not
    # contain the "swh-tr-link" marker.
    assert re.search("snapshot.*is empty", resp_content)
    assert not re.search("swh-tr-link", resp_content)
コード例 #2
0
def test_visit_and_snapshot_get_from_revision(swh_storage, sample_data):
    """Visit/snapshot lookup from a revision id only succeeds for revisions
    actually targeted by a snapshot stored for the origin.
    """
    origin = sample_data.origin
    swh_storage.origin_add([origin])

    date_visit2 = now()
    visit1, visit2 = sample_data.origin_visits[:2]
    assert visit1.origin == origin.url

    ov1, ov2 = swh_storage.origin_visit_add([visit1, visit2])

    # revision3 is intentionally NOT added to storage (unknown-revision case).
    revision1, revision2, revision3 = sample_data.revisions[:3]
    swh_storage.revision_add([revision1, revision2])

    # Only complete_snapshot is stored; empty_snapshot's id is referenced by
    # ovs2 below but the snapshot itself stays unknown to storage.
    empty_snapshot, complete_snapshot = sample_data.snapshots[1:3]
    swh_storage.snapshot_add([complete_snapshot])

    # Add complete_snapshot to visit1 which targets revision1
    ovs1, ovs2 = [
        OriginVisitStatus(
            origin=ov1.origin,
            visit=ov1.visit,
            date=date_visit2,
            type=ov1.type,
            status="partial",
            snapshot=complete_snapshot.id,
        ),
        OriginVisitStatus(
            origin=ov2.origin,
            visit=ov2.visit,
            date=now(),
            type=ov2.type,
            status="full",
            snapshot=empty_snapshot.id,
        ),
    ]

    swh_storage.origin_visit_status_add([ovs1, ovs2])
    # Sanity check: dates strictly increase across visits and statuses.
    assert ov1.date < ov2.date
    assert ov2.date < ovs1.date
    assert ovs1.date < ovs2.date

    # revision3 does not exist so result is None
    actual_snapshot_id = snapshot_id_get_from_revision(swh_storage, origin.url,
                                                       revision3.id)
    assert actual_snapshot_id is None

    # no snapshot targets revision2 for origin.url so result is None
    res = list(
        visits_and_snapshots_get_from_revision(swh_storage, origin.url,
                                               revision2.id))
    assert res == []

    # complete_snapshot targets at least revision1
    res = list(
        visits_and_snapshots_get_from_revision(swh_storage, origin.url,
                                               revision1.id))
    assert res == [(ov1, ovs1, complete_snapshot)]
コード例 #3
0
def test_iter_origin_visits(swh_storage, sample_data):
    """iter_origin_visits yields every visit of an origin, honoring order."""
    origin1, origin2 = sample_data.origins[:2]
    swh_storage.origin_add([origin1, origin2])

    base_date = now() - datetime.timedelta(weeks=20)

    # Create 20 visits of origin1, one per day, starting 20 weeks in the past.
    new_visits = [
        OriginVisit(
            origin=origin1.url,
            date=base_date + datetime.timedelta(days=day_offset),
            type="git",
        )
        for day_offset in range(20)
    ]

    visits = swh_storage.origin_visit_add(new_visits)

    # no limit, ascending order (the default): insertion order.
    assert list(iter_origin_visits(swh_storage, origin1.url)) == visits

    # no limit, descending order: reversed insertion order.
    descending = list(
        iter_origin_visits(swh_storage, origin1.url, order=ListOrder.DESC))
    assert descending == list(reversed(visits))

    # origin2 was never visited: no result.
    assert list(iter_origin_visits(swh_storage, origin2.url)) == []
コード例 #4
0
    def test_content_add_race(self, swh_storage, sample_data):
        """Two threads inserting the same content concurrently must both
        complete without raising (no race in _content_add_metadata)."""
        content = attr.evolve(sample_data.content, ctime=now())

        # Each thread reports a (thread id, "data"|"exc", payload) triple here.
        results = queue.Queue()

        def thread():
            try:
                with db_transaction(swh_storage) as (db, cur):
                    ret = swh_storage._content_add_metadata(db, cur, [content])
                results.put((threading.get_ident(), "data", ret))
            except Exception as e:
                results.put((threading.get_ident(), "exc", e))

        t1 = threading.Thread(target=thread)
        t2 = threading.Thread(target=thread)
        t1.start()
        # this avoids the race condition
        # import time
        # time.sleep(1)
        t2.start()
        t1.join()
        t2.join()

        r1 = results.get(block=False)
        r2 = results.get(block=False)

        # Exactly two results, from two distinct threads, neither an exception.
        with pytest.raises(queue.Empty):
            results.get(block=False)
        assert r1[0] != r2[0]
        assert r1[1] == "data", "Got exception %r in Thread%s" % (r1[2], r1[0])
        assert r2[1] == "data", "Got exception %r in Thread%s" % (r2[2], r2[0])
コード例 #5
0
    def test_content_add_metadata_db(self, swh_storage, sample_data):
        """content_add_metadata stores the row in the database and in the
        journal, without writing any data to the object storage."""
        content = attr.evolve(sample_data.content, data=None, ctime=now())

        actual_result = swh_storage.content_add_metadata([content])

        assert actual_result == {
            "content:add": 1,
        }

        # Metadata-only insertion: nothing must land in the objstorage.
        if hasattr(swh_storage, "objstorage"):
            assert content.sha1 not in swh_storage.objstorage.objstorage
        # The database row must match the content's hashes, length and status.
        with db_transaction(swh_storage) as (_, cur):
            cur.execute(
                "SELECT sha1, sha1_git, sha256, length, status"
                " FROM content WHERE sha1 = %s",
                (content.sha1,),
            )
            datum = cur.fetchone()
        assert datum == (
            content.sha1,
            content.sha1_git,
            content.sha256,
            content.length,
            "visible",
        )

        # The journal must have received exactly this one content object.
        contents = [
            obj
            for (obj_type, obj) in swh_storage.journal_writer.journal.objects
            if obj_type == "content"
        ]
        assert len(contents) == 1
        assert contents[0] == content
コード例 #6
0
def test_get_origin_visit_return_first_valid_partial_visit(
        archive_data, new_origin, new_snapshots):
    """get_origin_visit returns the most recent visit carrying a valid
    snapshot, even when that visit's status is only "partial"."""
    visits = []

    archive_data.origin_add([new_origin])
    # create 6 visits, the first three have full status but null snapshot
    # while the last three have partial status with valid snapshot
    # NOTE(review): origin_visit_status_add is only called when i > 2, so
    # the first three visits actually get no recorded status at all --
    # confirm this matches the comment above.
    for i, snp in enumerate(new_snapshots):
        visit_date = now() + timedelta(days=i * 10)
        visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        archive_data.snapshot_add([new_snapshots[i]])
        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=visit.visit,
            date=visit_date + timedelta(minutes=5),
            status="full" if i < 3 else "partial",
            snapshot=new_snapshots[i].id if i > 2 else None,
        )
        if i > 2:
            archive_data.origin_visit_status_add([visit_status])

        visits.append(visit.visit)

    # should return the last visit
    expected_visit = archive_data.origin_visit_get_by(new_origin.url,
                                                      visits[-1])
    assert get_origin_visit((OriginInfo(url=new_origin.url))) == expected_visit
コード例 #7
0
def test_get_origin_visit_non_resolvable_snapshots(archive_data, new_origin,
                                                   new_snapshots):
    """get_origin_visit ignores visits without a usable snapshot and falls
    back to the latest visit that has one (the third visit here)."""
    visits = []
    archive_data.origin_add([new_origin])
    # create 6 full visits, the first three have resolvable snapshots
    # while the last three have non resolvable snapshots
    # (statuses are only recorded for i < 3, so the last three visits carry
    # no snapshot association in storage)
    for i, snp in enumerate(new_snapshots):
        visit_date = now() + timedelta(days=i * 10)
        visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        archive_data.snapshot_add([new_snapshots[i]])
        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=visit.visit,
            date=visit_date + timedelta(minutes=5),
            status="full",
            snapshot=new_snapshots[i].id,
        )
        if i < 3:
            archive_data.origin_visit_status_add([visit_status])
        visits.append(visit.visit)

    # should return the third visit
    expected_visit = archive_data.origin_visit_get_by(new_origin.url,
                                                      visits[2])
    assert get_origin_visit((OriginInfo(url=new_origin.url))) == expected_visit
コード例 #8
0
ファイル: test_origin.py プロジェクト: shivam2003sy/swh-web
def test_origin_snapshot_invalid_branch(
    client, archive_data, new_origin, new_snapshot, visit_dates, revisions
):
    """Requesting the directory view of an existing origin with an unknown
    branch name must render the 404 error page."""
    snp_dict = new_snapshot.to_dict()
    archive_data.origin_add([new_origin])
    # Rewrite every branch of the snapshot to target one of the test revisions.
    for i, branch in enumerate(snp_dict["branches"].keys()):
        snp_dict["branches"][branch] = {
            "target_type": "revision",
            "target": hash_to_bytes(revisions[i]),
        }

    archive_data.snapshot_add([Snapshot.from_dict(snp_dict)])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="full",
        snapshot=snp_dict["id"],
    )
    archive_data.origin_visit_status_add([visit_status])

    # "invalid_branch" does not exist in the snapshot built above.
    url = reverse(
        "browse-origin-directory",
        query_params={"origin_url": new_origin.url, "branch": "invalid_branch"},
    )

    check_html_get_response(client, url, status_code=404, template_used="error.html")
コード例 #9
0
def _add_origin(storage,
                search,
                origin_url,
                visit_type="git",
                snapshot_branches=None):
    """Register origin_url in both the storage and search backends, then
    record one full visit of it pointing at a snapshot built from
    snapshot_branches.

    Args:
        storage: swh storage instance
        search: swh search instance
        origin_url: URL of the origin to create
        visit_type: type of the recorded visit (default: "git")
        snapshot_branches: branches mapping passed to Snapshot.from_dict;
            an empty snapshot is created when omitted

    """
    # Fixed: the default used to be a mutable `{}`; use the None sentinel
    # and build a fresh dict per call to avoid the shared-mutable-default
    # pitfall (backward compatible for all callers).
    if snapshot_branches is None:
        snapshot_branches = {}
    storage.origin_add([Origin(url=origin_url)])
    search.origin_update([{
        "url": origin_url,
        "has_visits": True,
        "visit_types": [visit_type]
    }])
    date = now()
    visit = OriginVisit(origin=origin_url, date=date, type=visit_type)
    visit = storage.origin_visit_add([visit])[0]
    snapshot = Snapshot.from_dict({"branches": snapshot_branches})
    storage.snapshot_add([snapshot])
    # Date the status one minute after the visit so it is the latest status,
    # and attach the snapshot to it with a "full" status.
    visit_status = OriginVisitStatus(
        origin=origin_url,
        visit=visit.visit,
        date=date + timedelta(minutes=1),
        type=visit.type,
        status="full",
        snapshot=snapshot.id,
    )
    storage.origin_visit_status_add([visit_status])
コード例 #10
0
def test_iter_origin_visit_status(swh_storage, sample_data):
    """iter_origin_visit_statuses yields every status of a visit, in the
    requested order, and nothing for an unrelated origin."""
    origin1, origin2 = sample_data.origins[:2]
    swh_storage.origin_add([origin1])

    ov1 = swh_storage.origin_visit_add([sample_data.origin_visit])[0]
    assert ov1.origin == origin1.url

    base_date = now() - datetime.timedelta(weeks=20)

    def make_status(status_date):
        # All statuses belong to the same visit; only the date varies.
        return OriginVisitStatus(
            origin=ov1.origin,
            visit=ov1.visit,
            date=status_date,
            type=ov1.type,
            status="created",
            snapshot=None,
        )

    # One status at the visit date plus 20 daily statuses 20 weeks in the past.
    new_visit_statuses = [make_status(ov1.date)] + [
        make_status(base_date + datetime.timedelta(days=day))
        for day in range(20)
    ]

    swh_storage.origin_visit_status_add(new_visit_statuses)

    # ascending order (the default): insertion order.
    ascending = list(
        iter_origin_visit_statuses(swh_storage, ov1.origin, ov1.visit))
    assert ascending == new_visit_statuses

    # descending order: reversed insertion order.
    descending = list(
        iter_origin_visit_statuses(swh_storage,
                                   ov1.origin,
                                   ov1.visit,
                                   order=ListOrder.DESC))
    assert descending == list(reversed(new_visit_statuses))

    # unrelated origin: no result.
    assert list(
        iter_origin_visit_statuses(swh_storage, origin2.url, ov1.visit)) == []
コード例 #11
0
ファイル: test_origin.py プロジェクト: shivam2003sy/swh-web
def test_api_lookup_origin_visits(api_client, archive_data, new_origin,
                                  visit_dates, new_snapshots):
    """The api-1-origin-visits endpoint pages through visits (most recent
    first), honoring the per_page and last_visit query parameters."""

    archive_data.origin_add([new_origin])
    # One full visit per date, each with its own snapshot.
    for i, visit_date in enumerate(visit_dates):
        origin_visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        archive_data.snapshot_add([new_snapshots[i]])
        visit_status = OriginVisitStatus(
            origin=new_origin.url,
            visit=origin_visit.visit,
            date=now(),
            status="full",
            snapshot=new_snapshots[i].id,
        )
        archive_data.origin_visit_status_add([visit_status])

    # Expected API ordering is latest-visit-first, hence the reversal.
    all_visits = list(reversed(get_origin_visits(new_origin.to_dict())))

    # Two pages: the first two visits, then everything after all_visits[1].
    for last_visit, expected_visits in (
        (None, all_visits[:2]),
        (all_visits[1]["visit"], all_visits[2:]),
    ):

        url = reverse(
            "api-1-origin-visits",
            url_args={"origin_url": new_origin.url},
            query_params={
                "per_page": 2,
                "last_visit": last_visit
            },
        )

        rv = check_api_get_responses(api_client, url, status_code=200)

        # Enrich expectations the same way the endpoint does before comparing.
        for i in range(len(expected_visits)):
            expected_visits[i] = enrich_origin_visit(
                expected_visits[i],
                with_origin_link=False,
                with_origin_visit_link=True,
                request=rv.wsgi_request,
            )

        assert rv.data == expected_visits
コード例 #12
0
def fill_storage(storage):
    """Populate storage with the module-level fixtures: origins,
    directories, revisions, snapshots, one full visit per snapshot,
    and the contents from OBJ_STORAGE_DATA."""
    storage.origin_add(ORIGINS)
    storage.directory_add([DIRECTORY, DIRECTORY2])
    storage.revision_add(REVISIONS)
    storage.snapshot_add(SNAPSHOTS)

    for visit_info, snapshot in zip(ORIGIN_VISITS, SNAPSHOTS):
        assert snapshot.id is not None

        added_visit = storage.origin_visit_add(
            [
                OriginVisit(
                    origin=visit_info["origin"],
                    date=now(),
                    type=visit_info["type"],
                )
            ]
        )[0]
        # Record a "full" status binding the snapshot to the visit.
        storage.origin_visit_status_add(
            [
                OriginVisitStatus(
                    origin=added_visit.origin,
                    visit=added_visit.visit,
                    date=now(),
                    status="full",
                    snapshot=snapshot.id,
                )
            ]
        )

    def build_content(obj_id, data):
        # sha1 and sha1_git both reuse obj_id, mirroring how
        # OBJ_STORAGE_DATA is keyed; the other hashes are recomputed.
        digests = hashutil.MultiHash.from_data(data).digest()
        return Content(
            data=data,
            length=len(data),
            status="visible",
            sha1=hash_to_bytes(obj_id),
            sha1_git=hash_to_bytes(obj_id),
            sha256=digests["sha256"],
            blake2s256=digests["blake2s256"],
        )

    storage.content_add(
        [build_content(obj_id, data) for obj_id, data in OBJ_STORAGE_DATA.items()]
    )
コード例 #13
0
ファイル: test_origin.py プロジェクト: shivam2003sy/swh-web
def test_origin_browse_directory_branch_with_non_resolvable_revision(
    client, archive_data, new_origin, unknown_revision
):
    """Browsing a branch that targets a revision absent from the archive
    still renders the directory view (HTTP 200) with a message naming the
    unresolved revision."""
    branch_name = "master"
    # Single-branch snapshot whose target revision was never added.
    snapshot = Snapshot(
        branches={
            branch_name.encode(): SnapshotBranch(
                target=hash_to_bytes(unknown_revision), target_type=TargetType.REVISION,
            )
        }
    )
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([snapshot])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=now(), type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="partial",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "browse-origin-directory",
        query_params={"origin_url": new_origin.url, "branch": branch_name},
    )

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/directory.html"
    )
    # Fixed: removed the stray space inside the f-string replacement field
    # ("{unknown_revision }"); the rendered message is unchanged.
    assert_contains(
        resp, f"Revision {unknown_revision} could not be found in the archive."
    )
コード例 #14
0
    def test_content_get_partition_murmur3_collision(self, swh_storage, mocker,
                                                     sample_data):
        """The Murmur3 token is used as link from index tables to the main table; and
        non-matching contents with colliding murmur3-hash are filtered-out when reading
        the main table.

        This test checks the content_get_partition endpoints return all contents, even
        the collisions.

        """
        # Counts invocations of the mocked token-range query.
        called = 0

        # One row per sample content, keyed by its (fake) token; "tok" is
        # kept inside the dict and stripped before building ContentRow.
        rows: Dict[int, Dict] = {}
        for tok, content in enumerate(sample_data.contents):
            cont = attr.evolve(content, data=None, ctime=now())
            row_d = {**cont.to_dict(), "tok": tok}
            rows[tok] = row_d

        # For all tokens, always return cont

        def mock_content_get_token_range(range_start, range_end, limit):
            nonlocal called
            called += 1

            for tok in list(
                    rows.keys()) * 3:  # yield multiple times the same tok
                row_d = dict(rows[tok].items())
                row_d.pop("tok")
                yield (tok, ContentRow(**row_d))

        mocker.patch.object(
            swh_storage._cql_runner,
            "content_get_token_range",
            mock_content_get_token_range,
        )

        actual_results = list(
            stream_results(swh_storage.content_get_partition,
                           partition_id=0,
                           nb_partitions=1))

        # The mocked endpoint must have actually been exercised.
        assert called > 0

        # everything is listed, even collisions
        assert len(actual_results) == 3 * len(sample_data.contents)
        # as we duplicated the returned results, dropping duplicate should yield
        # the original length
        assert len(set(actual_results)) == len(sample_data.contents)
コード例 #15
0
    def test_content_find_murmur3_collision(self, swh_storage, mocker,
                                            sample_data):
        """The Murmur3 token is used as link from index tables to the main
        table; and non-matching contents with colliding murmur3-hash
        are filtered-out when reading the main table.
        This test checks the content methods do filter out these collisions.
        """
        # Counts invocations of the two mocked cql-runner endpoints.
        called = 0

        cont, cont2 = [
            attr.evolve(c, ctime=now()) for c in sample_data.contents[:2]
        ]

        # always return a token
        def mock_cgtfsa(algo, hashes):
            nonlocal called
            called += 1
            assert algo in ("sha1", "sha1_git")
            return [123456]

        mocker.patch.object(
            swh_storage._cql_runner,
            "content_get_tokens_from_single_algo",
            mock_cgtfsa,
        )

        # For all tokens, always return rows for both cont and cont2,
        # simulating a murmur3 collision in the index table.
        cols = list(set(cont.to_dict()) - {"data"})

        def mock_cgft(tokens):
            nonlocal called
            called += 1
            # Fixed: the comprehension variable used to shadow the enclosing
            # `cont` (harmless in Python 3, but confusing); renamed to `c`.
            return [
                ContentRow(**{col: getattr(c, col) for col in cols})
                for c in [cont, cont2]
            ]

        mocker.patch.object(swh_storage._cql_runner, "content_get_from_tokens",
                            mock_cgft)

        expected_content = attr.evolve(cont, data=None)

        actual_result = swh_storage.content_find({"sha1": cont.sha1})

        # One call per mocked endpoint.
        assert called == 2

        # but cont2 should be filtered out
        assert actual_result == [expected_content]
コード例 #16
0
ファイル: test_origin.py プロジェクト: shivam2003sy/swh-web
def test_api_lookup_origin_visit_latest_with_snapshot(api_client, archive_data,
                                                      new_origin, visit_dates,
                                                      new_snapshots):
    """With require_snapshot=True, api-1-origin-visit-latest returns the
    most recent visit that has an associated snapshot."""
    archive_data.origin_add([new_origin])
    visit_dates.sort()
    visit_ids = []
    # Create one visit per date; none has a snapshot yet.
    for i, visit_date in enumerate(visit_dates):
        origin_visit = archive_data.origin_visit_add([
            OriginVisit(
                origin=new_origin.url,
                date=visit_date,
                type="git",
            )
        ])[0]
        visit_ids.append(origin_visit.visit)

    archive_data.snapshot_add([new_snapshots[0]])

    # Add snapshot to the latest visit
    visit_id = visit_ids[-1]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit_id,
        date=now(),
        status="full",
        snapshot=new_snapshots[0].id,
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse(
        "api-1-origin-visit-latest",
        url_args={"origin_url": new_origin.url},
        query_params={"require_snapshot": True},
    )

    rv = check_api_get_responses(api_client, url, status_code=200)

    # Reference value straight from the storage API, enriched the same way
    # the endpoint enriches its response.
    expected_visit = archive_data.origin_visit_status_get_latest(
        new_origin.url, type="git", require_snapshot=True)

    expected_visit = enrich_origin_visit(
        expected_visit,
        with_origin_link=True,
        with_origin_visit_link=False,
        request=rv.wsgi_request,
    )

    assert rv.data == expected_visit
コード例 #17
0
def test_retrying_proxy_storage_content_add_metadata(swh_storage, sample_data):
    """Standard content_add_metadata works as before"""
    content = attr.evolve(sample_data.content, data=None)
    sha1 = content.sha1

    # Not present yet.
    assert swh_storage.content_get([sha1]) == [None]

    # Adding the metadata (with a ctime) reports one insertion.
    stats = swh_storage.content_add_metadata([attr.evolve(content, ctime=now())])
    assert stats == {
        "content:add": 1,
    }

    # The content is now retrievable by its sha1.
    fetched = swh_storage.content_get([sha1])
    assert len(fetched) == 1
    assert fetched[0].sha1 == sha1
コード例 #18
0
ファイル: test_origin.py プロジェクト: shivam2003sy/swh-web
def test_origin_branches_pagination_with_alias(
    client, archive_data, mocker, new_origin, visit_dates, revisions, existing_release
):
    """
    When a snapshot contains a branch or a release alias, pagination links
    in the branches / releases view should be displayed.
    """
    # Force a small page size so pagination triggers.
    # NOTE(review): len(revisions) / 2 is a float; presumably PER_PAGE is
    # normally an int -- confirm the view tolerates a float here.
    mocker.patch("swh.web.browse.snapshot_context.PER_PAGE", len(revisions) / 2)
    snp_dict = {"branches": {}, "id": hash_to_bytes(random_sha1())}
    # One branch per test revision, each with a random 8-letter name.
    for i in range(len(revisions)):
        branch = "".join(random.choices(string.ascii_lowercase, k=8))
        snp_dict["branches"][branch.encode()] = {
            "target_type": "revision",
            "target": hash_to_bytes(revisions[i]),
        }
    # Plus an alias branch pointing at a release branch.
    release = "".join(random.choices(string.ascii_lowercase, k=8))
    snp_dict["branches"][b"RELEASE_ALIAS"] = {
        "target_type": "alias",
        "target": release.encode(),
    }
    snp_dict["branches"][release.encode()] = {
        "target_type": "release",
        "target": hash_to_bytes(existing_release),
    }
    archive_data.origin_add([new_origin])
    archive_data.snapshot_add([Snapshot.from_dict(snp_dict)])
    visit = archive_data.origin_visit_add(
        [OriginVisit(origin=new_origin.url, date=visit_dates[0], type="git",)]
    )[0]
    visit_status = OriginVisitStatus(
        origin=new_origin.url,
        visit=visit.visit,
        date=now(),
        status="full",
        snapshot=snp_dict["id"],
    )
    archive_data.origin_visit_status_add([visit_status])

    url = reverse("browse-origin-branches", query_params={"origin_url": new_origin.url})

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/branches.html"
    )
    # Pagination markup must be present in the rendered branches view.
    assert_contains(resp, '<ul class="pagination')
コード例 #19
0
 def test_pypi_missing_branch(self):
     """Indexing an origin visited with SAMPLE_SNAPSHOT yields no result
     (presumably the snapshot lacks the branch the pypi indexer looks
     for -- see the SAMPLE_SNAPSHOT definition to confirm)."""
     origin_url = "https://pypi.org/project/abcdef/"
     self.indexer.storage.origin_add([Origin(url=origin_url, )])
     visit = self.indexer.storage.origin_visit_add([
         OriginVisit(
             origin=origin_url,
             date=datetime(2019, 2, 27, tzinfo=timezone.utc),
             type="pypi",
         )
     ])[0]
     self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
     # Full visit status binding SAMPLE_SNAPSHOT to the visit.
     visit_status = OriginVisitStatus(
         origin=origin_url,
         visit=visit.visit,
         date=now(),
         status="full",
         snapshot=SAMPLE_SNAPSHOT.id,
     )
     self.indexer.storage.origin_visit_status_add([visit_status])
     self.indexer.run(["https://pypi.org/project/abcdef/"])
     self.assertEqual(self.indexer.results, [])
コード例 #20
0
def test_origin_get_latest_visit_status_filter_snapshot(
        swh_storage, sample_data):
    """origin_get_latest_visit_status(require_snapshot=True) returns the
    latest status carrying a snapshot and keeps returning it even after a
    newer, snapshot-less visit is added."""
    objects = init_storage_with_origin_visits(swh_storage, sample_data)
    origin1, origin2 = objects["origin"]
    _, ov2 = objects["origin_visit"]
    _, _, _, ovs22 = objects["origin_visit_status"]

    # there is no visit with snapshot yet for that visit
    assert (origin_get_latest_visit_status(
        swh_storage, origin1.url, require_snapshot=True) is None)

    # visit status with partial status visit elected
    actual_ovs22 = origin_get_latest_visit_status(swh_storage,
                                                  origin2.url,
                                                  require_snapshot=True)
    assert actual_ovs22 == ovs22
    assert actual_ovs22.origin == ov2.origin
    assert actual_ovs22.visit == ov2.visit
    assert actual_ovs22.type == ov2.type

    date_now = now()

    # Add another visit
    swh_storage.origin_visit_add([
        OriginVisit(
            origin=origin2.url,
            date=date_now,
            type=sample_data.type_visit2,
        ),
    ])

    # Requiring the latest visit with a snapshot, we still find the previous visit
    # (note: ovs22 is deliberately rebound here to the fresh lookup result)
    ovs22 = origin_get_latest_visit_status(swh_storage,
                                           origin2.url,
                                           require_snapshot=True)
    assert actual_ovs22 == ovs22
    assert actual_ovs22.origin == ov2.origin
    assert actual_ovs22.visit == ov2.visit
    assert actual_ovs22.type == ov2.type
コード例 #21
0
 def test_git_partial_snapshot(self):
     """Checks partial snapshots are ignored."""
     origin_url = "https://github.com/SoftwareHeritage/swh-core"
     self.indexer.storage.origin_add([Origin(url=origin_url)])
     visit = self.indexer.storage.origin_visit_add([
         OriginVisit(
             origin=origin_url,
             date=datetime(2019, 2, 27, tzinfo=timezone.utc),
             type="git",
         )
     ])[0]
     self.indexer.storage.snapshot_add([SAMPLE_SNAPSHOT])
     # The visit is recorded with "partial" status: the indexer must not
     # pick up its snapshot, so the run below produces no results.
     visit_status = OriginVisitStatus(
         origin=origin_url,
         visit=visit.visit,
         date=now(),
         status="partial",
         snapshot=SAMPLE_SNAPSHOT.id,
     )
     self.indexer.storage.origin_visit_status_add([visit_status])
     self.indexer.run([origin_url])
     self.assertEqual(self.indexer.results, [])
コード例 #22
0
def init_storage_with_origin_visits(swh_storage, sample_data):
    """Initialize storage with origin/origin-visit/origin-visit-status

    Creates two origins, one visit each, and four visit statuses (two per
    visit); only the last status of visit 2 carries a snapshot.

    Returns:
        dict with keys "origin", "origin_visit" and "origin_visit_status"
        mapping to the created objects in insertion order.
    """
    snapshot = sample_data.snapshots[2]
    origin1, origin2 = sample_data.origins[:2]
    swh_storage.origin_add([origin1, origin2])

    ov1, ov2 = swh_storage.origin_visit_add([
        OriginVisit(
            origin=origin1.url,
            date=sample_data.date_visit1,
            type=sample_data.type_visit1,
        ),
        OriginVisit(
            origin=origin2.url,
            date=sample_data.date_visit2,
            type=sample_data.type_visit2,
        ),
    ])

    swh_storage.snapshot_add([snapshot])

    date_now = now()
    date_now = round_to_milliseconds(date_now)
    # Sanity check: the sample visit dates strictly precede "now".
    assert sample_data.date_visit1 < sample_data.date_visit2
    assert sample_data.date_visit2 < date_now

    # origin visit status 1 for origin visit 1
    ovs11 = OriginVisitStatus(
        origin=ov1.origin,
        visit=ov1.visit,
        date=ov1.date + datetime.timedelta(seconds=10),  # so it's not ignored
        type=ov1.type,
        status="partial",
        snapshot=None,
    )
    # origin visit status 2 for origin visit 1
    ovs12 = OriginVisitStatus(
        origin=ov1.origin,
        visit=ov1.visit,
        date=sample_data.date_visit2,
        type=ov1.type,
        status="ongoing",
        snapshot=None,
    )
    # origin visit status 1 for origin visit 2
    ovs21 = OriginVisitStatus(
        origin=ov2.origin,
        visit=ov2.visit,
        date=ov2.date + datetime.timedelta(seconds=10),  # so it's not ignored
        type=ov2.type,
        status="ongoing",
        snapshot=None,
    )
    # origin visit status 2 for origin visit 2: the only status with a
    # snapshot (and some metadata), dated "now".
    ovs22 = OriginVisitStatus(
        origin=ov2.origin,
        visit=ov2.visit,
        date=date_now,
        type=ov2.type,
        status="full",
        snapshot=snapshot.id,
        metadata={"something": "wicked"},
    )

    swh_storage.origin_visit_status_add([ovs11, ovs12, ovs21, ovs22])
    return {
        "origin": [origin1, origin2],
        "origin_visit": [ov1, ov2],
        "origin_visit_status": [ovs11, ovs12, ovs21, ovs22],
    }
コード例 #23
0
    return d


testdata = [
    pytest.param(
        "content",
        "content_add",
        list(TEST_OBJECTS["content"]),
        attr.evolve(model.Content.from_data(data=b"too big"), length=1000),
        attr.evolve(model.Content.from_data(data=b"to fail"), length=1000),
        id="content",
    ),
    pytest.param(
        "content",
        "content_add_metadata",
        [attr.evolve(cnt, ctime=now()) for cnt in TEST_OBJECTS["content"]],
        attr.evolve(model.Content.from_data(data=b"too big"), length=1000, ctime=now()),
        attr.evolve(model.Content.from_data(data=b"to fail"), length=1000, ctime=now()),
        id="content_metadata",
    ),
    pytest.param(
        "skipped_content",
        "skipped_content_add",
        list(TEST_OBJECTS["skipped_content"]),
        attr.evolve(
            model.SkippedContent.from_data(data=b"too big", reason="too big"),
            length=1000,
        ),
        attr.evolve(
            model.SkippedContent.from_data(data=b"to fail", reason="to fail"),
            length=1000,
コード例 #24
0
def test_snapshot_get_latest(swh_storage, sample_data):
    """``snapshot_get_latest`` honors status filters and branch limits.

    A visit with a "partial" status and no snapshot yields ``None``; once a
    "full" status referencing a snapshot is recorded, the snapshot is
    returned, optionally truncated via ``branches_count``.
    """
    origin = sample_data.origin
    swh_storage.origin_add([origin])

    first_visit, second_visit = sample_data.origin_visits[:2]
    assert first_visit.origin == origin.url

    swh_storage.origin_visit_add([first_visit])
    latest_ov = swh_storage.origin_visit_get_latest(origin.url)

    # Add snapshot to visit1, latest snapshot = visit 1 snapshot
    full_snapshot = sample_data.snapshots[2]
    swh_storage.snapshot_add([full_snapshot])

    partial_status = OriginVisitStatus(
        origin=origin.url,
        visit=latest_ov.visit,
        date=second_visit.date,
        status="partial",
        snapshot=None,
    )
    swh_storage.origin_visit_status_add([partial_status])
    assert first_visit.date < second_visit.date

    # no snapshot associated to the visit, so None
    fetched = snapshot_get_latest(
        swh_storage, origin.url, allowed_statuses=["partial"]
    )
    assert fetched is None

    ts_now = now()
    assert second_visit.date < ts_now
    full_status = OriginVisitStatus(
        origin=latest_ov.origin,
        visit=latest_ov.visit,
        date=ts_now,
        type=latest_ov.type,
        status="full",
        snapshot=full_snapshot.id,
    )
    swh_storage.origin_visit_status_add([full_status])

    # A newer visit without any status must not hide the "full" snapshot.
    swh_storage.origin_visit_add(
        [OriginVisit(origin=origin.url, date=now(), type=first_visit.type)]
    )

    fetched = snapshot_get_latest(swh_storage, origin.url)
    assert fetched is not None
    assert fetched == full_snapshot

    fetched = snapshot_get_latest(swh_storage, origin.url, branches_count=1)
    assert fetched is not None
    assert fetched.id == full_snapshot.id
    assert len(fetched.branches.values()) == 1

    with pytest.raises(
        ValueError, match="branches_count must be a positive integer"
    ):
        snapshot_get_latest(
            swh_storage, origin.url, branches_count="something-wrong"
        )
コード例 #25
0
    def load(self) -> Dict[str, str]:
        r"""Loading logic for the loader to follow:

        - Store the actual ``origin_visit`` to storage
        - Call :meth:`prepare` to prepare any eventual state
        - Call :meth:`get_origin` to get the origin we work with and store

        - while True:

          - Call :meth:`fetch_data` to fetch the data to store
          - Call :meth:`process_data` to optionally run processing between
            :meth:`fetch_data` and :meth:`store_data`
          - Call :meth:`store_data` to store the data

        - Call :meth:`cleanup` to clean up any eventual state put in place
             in :meth:`prepare` method.

        Returns:
            ``{"status": "uneventful"}`` or ``{"status": "failed"}`` when an
            exception was caught; otherwise the result of
            :meth:`load_status`.

        """
        try:
            # Best-effort cleanup of leftovers from a previous run; a failure
            # here is reported but must not abort this load.
            with self.statsd_timed("pre_cleanup"):
                self.pre_cleanup()
        except Exception:
            msg = "Cleaning up dangling data failed! Continue loading."
            self.log.warning(msg)
            sentry_sdk.capture_exception()

        self._store_origin_visit()

        assert (
            self.visit.visit
        ), "The method `_store_origin_visit` should set the visit (OriginVisit)"
        self.log.info("Load origin '%s' with type '%s'", self.origin.url,
                      self.visit.type)

        try:
            # Extrinsic metadata loading is best-effort: errors are sent to
            # Sentry and logged, but do not fail the visit.
            with self.statsd_timed("build_extrinsic_origin_metadata"):
                metadata = self.build_extrinsic_origin_metadata()
            self.load_metadata_objects(metadata)
        except Exception as e:
            sentry_sdk.capture_exception(e)
            # Do not fail the whole task if this is the only failure
            self.log.exception(
                "Failure while loading extrinsic origin metadata.",
                extra={
                    "swh_task_args": [],
                    "swh_task_kwargs": {
                        "origin": self.origin.url,
                        "lister_name": self.lister_name,
                        "lister_instance_name": self.lister_instance_name,
                    },
                },
            )

        # Cumulative wall-clock time (seconds) spent in each phase of the
        # fetch/process/store loop; reported to statsd in milliseconds below.
        total_time_fetch_data = 0.0
        total_time_process_data = 0.0
        total_time_store_data = 0.0

        try:
            # Initially not a success, will be True when actually one
            success = False
            with self.statsd_timed("prepare"):
                self.prepare()

            # Iterate fetch -> process -> store until both fetch_data and
            # process_data report there is nothing more to fetch.
            while True:
                t1 = time.monotonic()
                more_data_to_fetch = self.fetch_data()
                t2 = time.monotonic()
                total_time_fetch_data += t2 - t1

                more_data_to_fetch = self.process_data() and more_data_to_fetch
                t3 = time.monotonic()
                total_time_process_data += t3 - t2

                self.store_data()
                t4 = time.monotonic()
                total_time_store_data += t4 - t3
                if not more_data_to_fetch:
                    break

            self.statsd_timing("fetch_data", total_time_fetch_data * 1000.0)
            self.statsd_timing("process_data",
                               total_time_process_data * 1000.0)
            self.statsd_timing("store_data", total_time_store_data * 1000.0)

            # Record the final visit status computed by the loader subclass.
            status = self.visit_status()
            visit_status = OriginVisitStatus(
                origin=self.origin.url,
                visit=self.visit.visit,
                type=self.visit_type,
                date=now(),
                status=status,
                snapshot=self.loaded_snapshot_id,
            )
            self.storage.origin_visit_status_add([visit_status])
            success = True
            with self.statsd_timed("post_load",
                                   tags={
                                       "success": success,
                                       "status": status
                                   }):
                self.post_load()
        except BaseException as e:
            success = False
            # NotFound means the origin disappeared upstream: record an
            # uneventful "not_found" visit rather than a task failure.
            if isinstance(e, NotFound):
                status = "not_found"
                task_status = "uneventful"
            else:
                # "partial" when some snapshot was stored before the error.
                status = "partial" if self.loaded_snapshot_id else "failed"
                task_status = "failed"

            self.log.exception(
                "Loading failure, updating to `%s` status",
                status,
                extra={
                    "swh_task_args": [],
                    "swh_task_kwargs": {
                        "origin": self.origin.url,
                        "lister_name": self.lister_name,
                        "lister_instance_name": self.lister_instance_name,
                    },
                },
            )
            if not isinstance(e, (SystemExit, KeyboardInterrupt)):
                sentry_sdk.capture_exception()
            # Still persist a visit status reflecting the failure.
            visit_status = OriginVisitStatus(
                origin=self.origin.url,
                visit=self.visit.visit,
                type=self.visit_type,
                date=now(),
                status=status,
                snapshot=self.loaded_snapshot_id,
            )
            self.storage.origin_visit_status_add([visit_status])
            with self.statsd_timed("post_load",
                                   tags={
                                       "success": success,
                                       "status": status
                                   }):
                self.post_load(success=success)
            if not isinstance(e, Exception):
                # e derives from BaseException but not Exception; this is most likely
                # SystemExit or KeyboardInterrupt, so we should re-raise it.
                raise
            return {"status": task_status}
        finally:
            # Flush buffered objects and clean up regardless of outcome.
            with self.statsd_timed("flush",
                                   tags={
                                       "success": success,
                                       "status": status
                                   }):
                self.flush()
            with self.statsd_timed("cleanup",
                                   tags={
                                       "success": success,
                                       "status": status
                                   }):
                self.cleanup()

        return self.load_status()
コード例 #26
0
def test_sub_directory_view_origin_context(client, archive_data,
                                           empty_directory, person, date):
    """Browsing a sub-directory in an origin context renders correctly.

    Builds a two-level directory tree reachable through a revision and a
    snapshot, records a full visit on a test origin, then runs the shared
    directory-view checks on the sub-directory.

    Fix over the previous version: the ``date`` fixture parameter was being
    rebound to ``now()`` after its earlier use, and the ``subdir``
    ``Directory`` object was rebound to a ``directory_ls`` entry dict; both
    rebindings are replaced with distinct local names.
    """
    origin_url = "test_sub_directory_view_origin_context"
    subdir = Directory(
        entries=(
            DirectoryEntry(
                name=b"foo",
                type="dir",
                target=hash_to_bytes(empty_directory),
                perms=DentryPerms.directory,
            ),
            DirectoryEntry(
                name=b"bar",
                type="dir",
                target=hash_to_bytes(empty_directory),
                perms=DentryPerms.directory,
            ),
        )
    )

    parentdir = Directory(
        entries=(
            DirectoryEntry(
                name=b"baz",
                type="dir",
                target=subdir.id,
                perms=DentryPerms.directory,
            ),
        )
    )
    archive_data.directory_add([subdir, parentdir])

    revision = Revision(
        directory=parentdir.id,
        author=person,
        committer=person,
        message=b"commit message",
        date=TimestampWithTimezone.from_datetime(date),
        committer_date=TimestampWithTimezone.from_datetime(date),
        synthetic=False,
        type=RevisionType.GIT,
    )
    archive_data.revision_add([revision])

    snapshot = Snapshot(
        branches={
            b"HEAD":
            SnapshotBranch(
                target="refs/head/master".encode(),
                target_type=TargetType.ALIAS,
            ),
            b"refs/head/master":
            SnapshotBranch(
                target=revision.id,
                target_type=TargetType.REVISION,
            ),
        })
    archive_data.snapshot_add([snapshot])

    archive_data.origin_add([Origin(url=origin_url)])
    # Distinct name: do not shadow the `date` fixture used for the revision.
    visit_date = now()
    visit = OriginVisit(origin=origin_url, date=visit_date, type="git")
    visit = archive_data.origin_visit_add([visit])[0]
    visit_status = OriginVisitStatus(
        origin=origin_url,
        visit=visit.visit,
        date=visit_date,
        status="full",
        snapshot=snapshot.id,
    )
    archive_data.origin_visit_status_add([visit_status])

    dir_content = archive_data.directory_ls(hash_to_hex(parentdir.id))
    # Distinct name: do not rebind the `subdir` Directory model object.
    subdir_entry = dir_content[0]
    subdir_content = archive_data.directory_ls(subdir_entry["target"])
    _directory_view_checks(
        client,
        hash_to_hex(parentdir.id),
        subdir_content,
        subdir_entry["name"],
        origin_url,
        hash_to_hex(snapshot.id),
        hash_to_hex(revision.id),
    )