Example #1
def _get_visit_info_for_save_request(save_request):
    visit_date = None
    visit_status = None
    time_now = datetime.now(tz=timezone.utc)
    time_delta = time_now - save_request.request_date
    # stop trying to find a visit date one month after the save request was
    # submitted, as those storage requests are expensive and the associated
    # loading task has almost certainly ended in errors
    if time_delta.days <= 30:
        try:
            origin = {"url": save_request.origin_url}
            origin_info = archive.lookup_origin(origin)
            origin_visits = get_origin_visits(origin_info)
            visit_dates = [
                parse_iso8601_date_to_utc(v["date"]) for v in origin_visits
            ]
            i = bisect_right(visit_dates, save_request.request_date)
            if i != len(visit_dates):
                visit_date = visit_dates[i]
                visit_status = origin_visits[i]["status"]
                if origin_visits[i]["status"] not in ("full", "partial",
                                                      "not_found"):
                    visit_date = None
        except Exception as exc:
            sentry_sdk.capture_exception(exc)
    return visit_date, visit_status
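The bisect_right call above returns the index of the first visit made strictly after the save request date; a minimal, self-contained sketch with illustrative dates (not real archive data):

from bisect import bisect_right
from datetime import datetime, timezone

visit_dates = [
    datetime(2021, 3, 1, tzinfo=timezone.utc),
    datetime(2021, 3, 5, tzinfo=timezone.utc),
    datetime(2021, 3, 9, tzinfo=timezone.utc),
]
request_date = datetime(2021, 3, 4, tzinfo=timezone.utc)

# bisect_right skips past any visit equal to the request date, so the
# index points at the first visit strictly after it
i = bisect_right(visit_dates, request_date)
if i != len(visit_dates):
    print(visit_dates[i])  # 2021-03-05 00:00:00+00:00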
Example #2
def _origin_visits_browse(request, origin_url):
    if origin_url is None:
        raise BadInputExc("An origin URL must be provided as query parameter.")

    origin_info = archive.lookup_origin({"url": origin_url})
    origin_visits = get_origin_visits(origin_info)
    snapshot_context = get_snapshot_context(origin_url=origin_url)

    for i, visit in enumerate(origin_visits):
        url_date = format_utc_iso_date(visit["date"], "%Y-%m-%dT%H:%M:%SZ")
        visit["formatted_date"] = format_utc_iso_date(visit["date"])
        query_params = {"origin_url": origin_url, "timestamp": url_date}
        if i < len(origin_visits) - 1:
            if visit["date"] == origin_visits[i + 1]["date"]:
                query_params = {"visit_id": visit["visit"]}
        if i > 0:
            if visit["date"] == origin_visits[i - 1]["date"]:
                query_params = {"visit_id": visit["visit"]}

        snapshot = visit["snapshot"] if visit["snapshot"] else ""

        visit["url"] = reverse(
            "browse-origin-directory",
            query_params=query_params,
        )
        if not snapshot:
            visit["snapshot"] = ""
        visit["date"] = parse_iso8601_date_to_utc(visit["date"]).timestamp()

    heading = "Origin visits - %s" % origin_url

    return render(
        request,
        "browse/origin-visits.html",
        {
            "heading": heading,
            "swh_object_name": "Visits",
            "swh_object_metadata": origin_info,
            "origin_visits": origin_visits,
            "origin_info": origin_info,
            "snapshot_context": snapshot_context,
            "vault_cooking": None,
            "show_actions": False,
        },
    )
Example #3
def _stat_counters(request):
    stat_counters = archive.stat_counters()
    url = get_config()["history_counters_url"]
    stat_counters_history = {}
    try:
        response = requests.get(url, timeout=5)
        stat_counters_history = json.loads(response.text)
        for d, object_counts in _stat_counters_backfill.items():
            # convert date to javascript timestamp (in ms)
            timestamp = int(parse_iso8601_date_to_utc(d).timestamp()) * 1000
            for object_type, object_count in object_counts.items():
                stat_counters_history[object_type].append(
                    [timestamp, object_count])
    except Exception as exc:
        sentry_sdk.capture_exception(exc)

    counters = {
        "stat_counters": stat_counters,
        "stat_counters_history": stat_counters_history,
    }
    return JsonResponse(counters)
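The loop above turns each backfill date into a JavaScript-style millisecond timestamp; a small sketch of that conversion (the module path and input value are assumptions for illustration):

from swh.web.common.utils import parse_iso8601_date_to_utc  # assumed module path

timestamp_ms = int(parse_iso8601_date_to_utc("2021-06-01T00:00:00Z").timestamp()) * 1000
print(timestamp_ms)  # 1622505600000, i.e. milliseconds since the Unix epoch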
Example #4
def get_origin_visit(
    origin_info: OriginInfo,
    visit_ts: Optional[str] = None,
    visit_id: Optional[int] = None,
    snapshot_id: Optional[str] = None,
) -> OriginVisitInfo:
    """Function that returns information about a visit for a given origin.

    If a timestamp is provided, the closest visit from that
    timestamp is returned.

    If a snapshot identifier is provided, the first visit with that snapshot
    is returned.

    If no search hints are provided, return the most recent full visit with
    a valid snapshot or the most recent partial visit with a valid snapshot
    otherwise.

    Args:
        origin_info: a dict filled with origin information
        visit_ts: an ISO 8601 datetime string to parse
        visit_id: a visit identifier
        snapshot_id: a snapshot identifier

    Returns:
        A dict containing the visit info.

    Raises:
        swh.web.common.exc.NotFoundExc: if no visit can be found
    """
    # returns the latest full visit with a valid snapshot
    visit = archive.lookup_origin_visit_latest(origin_info["url"],
                                               allowed_statuses=["full"],
                                               require_snapshot=True)
    if not visit:
        # or the latest partial visit with a valid snapshot otherwise
        visit = archive.lookup_origin_visit_latest(
            origin_info["url"],
            allowed_statuses=["partial"],
            require_snapshot=True)

    if not visit_ts and not visit_id and not snapshot_id:
        if visit:
            return visit
        else:
            raise NotFoundExc(
                f"No valid visit for origin with url {origin_info['url']} found!"
            )

    # no need to fetch the full list of visits and search it if the
    # latest visit already matches one of the criteria
    if visit and (visit["snapshot"] == snapshot_id
                  or visit["visit"] == visit_id):
        return visit

    visits = get_origin_visits(origin_info)

    if not visits:
        raise NotFoundExc(
            f"No visits associated to origin with url {origin_info['url']}!")

    if snapshot_id:
        visits = [v for v in visits if v["snapshot"] == snapshot_id]
        if len(visits) == 0:
            raise NotFoundExc(
                ("Visit for snapshot with id %s for origin with"
                 " url %s not found!" % (snapshot_id, origin_info["url"])))
        return visits[0]

    if visit_id:
        visits = [v for v in visits if v["visit"] == int(visit_id)]
        if len(visits) == 0:
            raise NotFoundExc(
                ("Visit with id %s for origin with"
                 " url %s not found!" % (visit_id, origin_info["url"])))
        return visits[0]

    if visit_ts:
        target_visit_ts = math.floor(
            parse_iso8601_date_to_utc(visit_ts).timestamp())

        # Find the visit with date closest to the target (in absolute value)
        (abs_time_delta, visit_idx) = min(
            ((math.floor(parse_iso8601_date_to_utc(
                visit["date"]).timestamp()), i)
             for (i, visit) in enumerate(visits)),
            key=lambda ts_and_i: abs(ts_and_i[0] - target_visit_ts),
        )

        if visit_idx is not None:
            visit = visits[visit_idx]
            # If multiple visits have the same date, select the one with
            # the largest id.
            while (visit_idx < len(visits) - 1
                   and visit["date"] == visits[visit_idx + 1]["date"]):
                visit_idx = visit_idx + 1
                visit = visits[visit_idx]
            return visit
        else:
            raise NotFoundExc(
                ("Visit with timestamp %s for origin with "
                 "url %s not found!" % (visit_ts, origin_info["url"])))
    return visits[-1]
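A hedged sketch of how this helper might be called; the origin URL and timestamp below are placeholders, and origin_info would normally come from archive.lookup_origin:

origin_info = {"url": "https://example.org/repo.git"}  # hypothetical origin

# most recent full (or, failing that, partial) visit with a valid snapshot
latest = get_origin_visit(origin_info)

# visit whose date is closest to the given ISO 8601 timestamp; ties on the
# date are resolved in favor of the largest visit id, as implemented above
closest = get_origin_visit(origin_info, visit_ts="2021-06-01T00:00:00Z")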
Example #5
def _visit_sort_key(visit):
    ts = parse_iso8601_date_to_utc(visit["date"]).timestamp()
    return ts + (float(visit["visit"]) / 10e3)
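The visit id contributes a tiny fraction (visit / 10e3, i.e. visit / 10000) to the key, so visits sharing the same date are ordered by id; a quick illustrative check, assuming _visit_sort_key and parse_iso8601_date_to_utc are in scope:

visits = [
    {"date": "2021-03-01T00:00:00Z", "visit": 2},
    {"date": "2021-03-01T00:00:00Z", "visit": 1},
    {"date": "2021-02-01T00:00:00Z", "visit": 3},
]
# earlier date first, then ties broken by ascending visit id
print([v["visit"] for v in sorted(visits, key=_visit_sort_key)])  # [3, 1, 2]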
Example #6
def _origin_directory_view_test_helper(
    client,
    archive_data,
    origin_info,
    origin_visit,
    snapshot_sizes,
    origin_branches,
    origin_releases,
    root_directory_sha1,
    directory_entries,
    visit_id=None,
    timestamp=None,
    snapshot_id=None,
    path=None,
):
    dirs = [e for e in directory_entries if e["type"] in ("dir", "rev")]
    files = [e for e in directory_entries if e["type"] == "file"]

    if not visit_id and not snapshot_id:
        visit_id = origin_visit["visit"]

    query_params = {"origin_url": origin_info["url"]}

    if timestamp:
        query_params["timestamp"] = timestamp
    elif visit_id:
        query_params["visit_id"] = visit_id
    else:
        query_params["snapshot"] = snapshot_id

    if path:
        query_params["path"] = path

    url = reverse("browse-origin-directory", query_params=query_params)

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/directory.html"
    )
    assert_contains(resp, '<td class="swh-directory">', count=len(dirs))
    assert_contains(resp, '<td class="swh-content">', count=len(files))

    if timestamp:
        query_params["timestamp"] = format_utc_iso_date(
            parse_iso8601_date_to_utc(timestamp).isoformat(), "%Y-%m-%dT%H:%M:%SZ"
        )

    for d in dirs:
        if d["type"] == "rev":
            dir_url = reverse("browse-revision", url_args={"sha1_git": d["target"]})
        else:
            dir_path = d["name"]
            if path:
                dir_path = "%s/%s" % (path, d["name"])
            query_params["path"] = dir_path
            dir_url = reverse("browse-origin-directory", query_params=query_params)
        assert_contains(resp, dir_url)

    for f in files:
        file_path = f["name"]
        if path:
            file_path = "%s/%s" % (path, f["name"])
        query_params["path"] = file_path
        file_url = reverse("browse-origin-content", query_params=query_params)
        assert_contains(resp, file_url)

    if "path" in query_params:
        del query_params["path"]

    root_dir_branch_url = reverse("browse-origin-directory", query_params=query_params)

    nb_bc_paths = 1
    if path:
        nb_bc_paths = len(path.split("/")) + 1

    assert_contains(resp, '<li class="swh-path">', count=nb_bc_paths)
    assert_contains(
        resp, '<a href="%s">%s</a>' % (root_dir_branch_url, root_directory_sha1[:7])
    )

    origin_branches_url = reverse("browse-origin-branches", query_params=query_params)

    assert_contains(resp, f'href="{escape(origin_branches_url)}"')
    assert_contains(resp, f"Branches ({snapshot_sizes['revision']})")

    origin_releases_url = reverse("browse-origin-releases", query_params=query_params)

    nb_releases = len(origin_releases)
    if nb_releases > 0:
        assert_contains(resp, f'href="{escape(origin_releases_url)}"')
        assert_contains(resp, f"Releases ({snapshot_sizes['release']})")

    if path:
        query_params["path"] = path

    assert_contains(resp, '<li class="swh-branch">', count=len(origin_branches))

    for branch in origin_branches:
        query_params["branch"] = branch["name"]
        root_dir_branch_url = reverse(
            "browse-origin-directory", query_params=query_params
        )

        assert_contains(resp, '<a href="%s">' % root_dir_branch_url)

    assert_contains(resp, '<li class="swh-release">', count=len(origin_releases))

    query_params["branch"] = None
    for release in origin_releases:
        query_params["release"] = release["name"]
        root_dir_release_url = reverse(
            "browse-origin-directory", query_params=query_params
        )

        assert_contains(resp, 'href="%s"' % root_dir_release_url)

    assert_contains(resp, "vault-cook-directory")
    assert_contains(resp, "vault-cook-revision")

    snapshot = archive_data.snapshot_get(origin_visit["snapshot"])
    head_rev_id = archive_data.snapshot_get_head(snapshot)

    swhid_context = {
        "origin": origin_info["url"],
        "visit": gen_swhid(SNAPSHOT, snapshot["id"]),
        "anchor": gen_swhid(REVISION, head_rev_id),
        "path": f"/{path}" if path else None,
    }

    swh_dir_id = gen_swhid(
        DIRECTORY, directory_entries[0]["dir_id"], metadata=swhid_context
    )
    swh_dir_id_url = reverse("browse-swhid", url_args={"swhid": swh_dir_id})
    assert_contains(resp, swh_dir_id)
    assert_contains(resp, swh_dir_id_url)

    assert_contains(resp, "swh-take-new-snapshot")

    _check_origin_link(resp, origin_info["url"])

    assert_not_contains(resp, "swh-metadata-popover")
Example #7
def _origin_content_view_test_helper(
    client,
    archive_data,
    origin_info,
    origin_visit,
    snapshot_sizes,
    origin_branches,
    origin_releases,
    root_dir_sha1,
    content,
    visit_id=None,
    timestamp=None,
    snapshot_id=None,
):
    content_path = "/".join(content["path"].split("/")[1:])

    if not visit_id and not snapshot_id:
        visit_id = origin_visit["visit"]

    query_params = {"origin_url": origin_info["url"], "path": content_path}

    if timestamp:
        query_params["timestamp"] = timestamp

    if visit_id:
        query_params["visit_id"] = visit_id
    elif snapshot_id:
        query_params["snapshot"] = snapshot_id

    url = reverse("browse-origin-content", query_params=query_params)

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/content.html"
    )

    assert isinstance(content["data"], str)

    assert_contains(resp, '<code class="%s">' % content["hljs_language"])
    assert_contains(resp, escape(content["data"]))

    split_path = content_path.split("/")

    filename = split_path[-1]
    path = content_path.replace(filename, "")[:-1]

    path_info = gen_path_info(path)

    del query_params["path"]

    if timestamp:
        query_params["timestamp"] = format_utc_iso_date(
            parse_iso8601_date_to_utc(timestamp).isoformat(), "%Y-%m-%dT%H:%M:%SZ"
        )

    root_dir_url = reverse("browse-origin-directory", query_params=query_params)

    assert_contains(resp, '<li class="swh-path">', count=len(path_info) + 1)

    assert_contains(resp, '<a href="%s">%s</a>' % (root_dir_url, root_dir_sha1[:7]))

    for p in path_info:
        query_params["path"] = p["path"]
        dir_url = reverse("browse-origin-directory", query_params=query_params)
        assert_contains(resp, '<a href="%s">%s</a>' % (dir_url, p["name"]))

    assert_contains(resp, "<li>%s</li>" % filename)

    query_string = "sha1_git:" + content["sha1_git"]

    url_raw = reverse(
        "browse-content-raw",
        url_args={"query_string": query_string},
        query_params={"filename": filename},
    )
    assert_contains(resp, url_raw)

    if "path" in query_params:
        del query_params["path"]

    origin_branches_url = reverse("browse-origin-branches", query_params=query_params)

    assert_contains(resp, f'href="{escape(origin_branches_url)}"')
    assert_contains(resp, f"Branches ({snapshot_sizes['revision']})")

    origin_releases_url = reverse("browse-origin-releases", query_params=query_params)

    assert_contains(resp, f'href="{escape(origin_releases_url)}">')
    assert_contains(resp, f"Releases ({snapshot_sizes['release']})")

    assert_contains(resp, '<li class="swh-branch">', count=len(origin_branches))

    query_params["path"] = content_path

    for branch in origin_branches:
        root_dir_branch_url = reverse(
            "browse-origin-content",
            query_params={"branch": branch["name"], **query_params},
        )

        assert_contains(resp, '<a href="%s">' % root_dir_branch_url)

    assert_contains(resp, '<li class="swh-release">', count=len(origin_releases))

    query_params["branch"] = None
    for release in origin_releases:
        root_dir_release_url = reverse(
            "browse-origin-content",
            query_params={"release": release["name"], **query_params},
        )

        assert_contains(resp, '<a href="%s">' % root_dir_release_url)

    url = reverse("browse-origin-content", query_params=query_params)

    resp = check_html_get_response(
        client, url, status_code=200, template_used="browse/content.html"
    )

    snapshot = archive_data.snapshot_get(origin_visit["snapshot"])
    head_rev_id = archive_data.snapshot_get_head(snapshot)

    swhid_context = {
        "origin": origin_info["url"],
        "visit": gen_swhid(SNAPSHOT, snapshot["id"]),
        "anchor": gen_swhid(REVISION, head_rev_id),
        "path": f"/{content_path}",
    }

    swh_cnt_id = gen_swhid(CONTENT, content["sha1_git"], metadata=swhid_context)
    swh_cnt_id_url = reverse("browse-swhid", url_args={"swhid": swh_cnt_id})
    assert_contains(resp, swh_cnt_id)
    assert_contains(resp, swh_cnt_id_url)

    assert_contains(resp, "swh-take-new-snapshot")

    _check_origin_link(resp, origin_info["url"])

    assert_not_contains(resp, "swh-metadata-popover")
Example #8
def test_revision_log_browse(client, archive_data, revision):
    per_page = 10

    revision_log = archive_data.revision_log(revision)

    revision_log_sorted = sorted(
        revision_log,
        key=lambda rev: -parse_iso8601_date_to_utc(rev["committer_date"]).timestamp(),
    )

    url = reverse(
        "browse-revision-log",
        url_args={"sha1_git": revision},
        query_params={"per_page": per_page},
    )

    next_page_url = reverse(
        "browse-revision-log",
        url_args={"sha1_git": revision},
        query_params={
            "offset": per_page,
            "per_page": per_page,
        },
    )

    nb_log_entries = per_page
    if len(revision_log_sorted) < per_page:
        nb_log_entries = len(revision_log_sorted)

    resp = check_html_get_response(client,
                                   url,
                                   status_code=200,
                                   template_used="browse/revision-log.html")
    assert_contains(resp,
                    '<tr class="swh-revision-log-entry',
                    count=nb_log_entries)
    assert_contains(resp, '<a class="page-link">Newer</a>')

    if len(revision_log_sorted) > per_page:
        assert_contains(
            resp,
            '<a class="page-link" href="%s">Older</a>' % escape(next_page_url),
        )

    for log in revision_log_sorted[:per_page]:
        revision_url = reverse("browse-revision",
                               url_args={"sha1_git": log["id"]})
        assert_contains(resp, log["id"][:7])
        assert_contains(resp, log["author"]["name"])
        assert_contains(resp, format_utc_iso_date(log["date"]))
        assert_contains(resp, escape(log["message"]))
        assert_contains(resp, format_utc_iso_date(log["committer_date"]))
        assert_contains(resp, revision_url)

    if len(revision_log_sorted) <= per_page:
        return

    resp = check_html_get_response(client,
                                   next_page_url,
                                   status_code=200,
                                   template_used="browse/revision-log.html")

    prev_page_url = reverse(
        "browse-revision-log",
        url_args={"sha1_git": revision},
        query_params={
            "offset": 0,
            "per_page": per_page
        },
    )
    next_page_url = reverse(
        "browse-revision-log",
        url_args={"sha1_git": revision},
        query_params={
            "offset": 2 * per_page,
            "per_page": per_page
        },
    )

    nb_log_entries = len(revision_log_sorted) - per_page
    if nb_log_entries > per_page:
        nb_log_entries = per_page

    assert_contains(resp,
                    '<tr class="swh-revision-log-entry',
                    count=nb_log_entries)

    assert_contains(
        resp,
        '<a class="page-link" href="%s">Newer</a>' % escape(prev_page_url))

    if len(revision_log_sorted) > 2 * per_page:
        assert_contains(
            resp,
            '<a class="page-link" href="%s">Older</a>' % escape(next_page_url),
        )

    if len(revision_log_sorted) <= 2 * per_page:
        return

    resp = check_html_get_response(client,
                                   next_page_url,
                                   status_code=200,
                                   template_used="browse/revision-log.html")

    prev_page_url = reverse(
        "browse-revision-log",
        url_args={"sha1_git": revision},
        query_params={
            "offset": per_page,
            "per_page": per_page
        },
    )
    next_page_url = reverse(
        "browse-revision-log",
        url_args={"sha1_git": revision},
        query_params={
            "offset": 3 * per_page,
            "per_page": per_page
        },
    )

    nb_log_entries = len(revision_log_sorted) - 2 * per_page
    if nb_log_entries > per_page:
        nb_log_entries = per_page

    assert_contains(resp,
                    '<tr class="swh-revision-log-entry',
                    count=nb_log_entries)
    assert_contains(
        resp,
        '<a class="page-link" href="%s">Newer</a>' % escape(prev_page_url))

    if len(revision_log_sorted) > 3 * per_page:
        assert_contains(
            resp,
            '<a class="page-link" href="%s">Older</a>' % escape(next_page_url),
        )
Example #9
def test_parse_iso8601_date_to_utc_ko(invalid_iso8601_timestamp):
    with pytest.raises(BadInputExc):
        utils.parse_iso8601_date_to_utc(invalid_iso8601_timestamp)
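For illustration, an input of the kind the invalid_iso8601_timestamp fixture might supply; the concrete value here is an assumption, not taken from the fixtures:

with pytest.raises(BadInputExc):
    utils.parse_iso8601_date_to_utc("not-a-date")  # hypothetical invalid input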
Example #10
def test_parse_iso8601_date_to_utc_ok(input_timestamp, output_date):
    assert utils.parse_iso8601_date_to_utc(input_timestamp) == output_date
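A hedged sketch of the kind of (input_timestamp, output_date) pair these parametrized tests receive; the actual fixtures live in the test suite's conftest, so this pair is illustrative:

from datetime import datetime, timezone

# a timezone-aware input is normalized to UTC (+02:00 becomes 10:00 UTC)
assert utils.parse_iso8601_date_to_utc("2021-01-01T12:00:00+02:00") == datetime(
    2021, 1, 1, 10, 0, tzinfo=timezone.utc
)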