def _get_visit_info_for_save_request(save_request):
    """Find the origin visit triggered by a save request.

    Returns:
        a ``(visit_date, visit_status)`` tuple; both are ``None`` when no
        matching visit could be found (or the request is too old to check).
    """
    found_date = None
    found_status = None
    age = datetime.now(tz=timezone.utc) - save_request.request_date
    # Stop trying to find a visit date one month after save request
    # submission: those storage requests are expensive and the associated
    # loading task surely ended up with errors by then.
    if age.days > 30:
        return found_date, found_status
    try:
        origin_info = archive.lookup_origin({"url": save_request.origin_url})
        visits = get_origin_visits(origin_info)
        dates = [parse_iso8601_date_to_utc(v["date"]) for v in visits]
        # first visit strictly after the save request submission date
        idx = bisect_right(dates, save_request.request_date)
        if idx < len(dates):
            found_status = visits[idx]["status"]
            # only a terminal visit status yields a usable visit date
            if found_status in ("full", "partial", "not_found"):
                found_date = dates[idx]
    except Exception as exc:
        # best effort: report the failure but do not break the caller
        sentry_sdk.capture_exception(exc)
    return found_date, found_status
def _origin_visits_browse(request, origin_url):
    """Render the list of visits for a given origin.

    Raises:
        BadInputExc: when no origin URL was supplied.
    """
    if origin_url is None:
        raise BadInputExc("An origin URL must be provided as query parameter.")
    origin_info = archive.lookup_origin({"url": origin_url})
    visits = get_origin_visits(origin_info)
    snapshot_context = get_snapshot_context(origin_url=origin_url)
    nb_visits = len(visits)
    for idx, visit in enumerate(visits):
        url_date = format_utc_iso_date(visit["date"], "%Y-%m-%dT%H:%M:%SZ")
        visit["formatted_date"] = format_utc_iso_date(visit["date"])
        query_params = {"origin_url": origin_url, "timestamp": url_date}
        # when several visits share the same date, a timestamp is ambiguous:
        # target the visit id instead
        same_as_next = (
            idx < nb_visits - 1 and visit["date"] == visits[idx + 1]["date"]
        )
        same_as_prev = idx > 0 and visit["date"] == visits[idx - 1]["date"]
        if same_as_next or same_as_prev:
            query_params = {"visit_id": visit["visit"]}
        snapshot = visit["snapshot"] if visit["snapshot"] else ""
        visit["url"] = reverse(
            "browse-origin-directory",
            query_params=query_params,
        )
        if not snapshot:
            visit["snapshot"] = ""
        # template expects a POSIX timestamp
        visit["date"] = parse_iso8601_date_to_utc(visit["date"]).timestamp()
    heading = f"Origin visits - {origin_url}"
    return render(
        request,
        "browse/origin-visits.html",
        {
            "heading": heading,
            "swh_object_name": "Visits",
            "swh_object_metadata": origin_info,
            "origin_visits": visits,
            "origin_info": origin_info,
            "snapshot_context": snapshot_context,
            "vault_cooking": None,
            "show_actions": False,
        },
    )
def get_origin_visits(origin_info: OriginInfo) -> List[OriginVisitInfo]:
    """Function that returns the list of visits for a swh origin.

    That list is put in cache in order to speedup the navigation
    in the swh web browse ui.

    The returned visits are sorted according to their date in
    ascending order.

    Args:
        origin_info: dict describing the origin to fetch visits from

    Returns:
        A list of dict describing the origin visits

    Raises:
        swh.web.common.exc.NotFoundExc: if the origin is not found
    """
    from swh.web.common import archive

    origin_url = (
        origin_info["url"]
        if "url" in origin_info
        else archive.lookup_origin(origin_info)["url"]
    )

    cache_entry_id = "origin_visits_%s" % origin_url
    cached_visits = cache.get(cache_entry_id)
    if cached_visits:
        newest_cached = cached_visits[-1]
        # the cache is still fresh if no visit was created after the newest
        # cached one and the latest snapshot did not change
        new_visits = list(
            archive.lookup_origin_visits(
                origin_url, last_visit=newest_cached["visit"]
            )
        )
        if not new_visits:
            last_snp = archive.lookup_latest_origin_snapshot(origin_url)
            if not last_snp or last_snp["id"] == newest_cached["snapshot"]:
                return cached_visits

    # cache miss or stale cache: fetch all visits page by page
    all_visits = []
    per_page = archive.MAX_LIMIT
    offset = None
    while True:
        batch = list(
            archive.lookup_origin_visits(
                origin_url, last_visit=offset, per_page=per_page
            )
        )
        all_visits.extend(batch)
        if len(batch) < per_page:
            break
        offset = per_page if not offset else offset + per_page

    def _visit_sort_key(visit):
        # sort by date, using the visit id as a tiebreaker
        ts = parse_iso8601_date_to_utc(visit["date"]).timestamp()
        return ts + (float(visit["visit"]) / 10e3)

    all_visits.sort(key=_visit_sort_key)
    cache.set(cache_entry_id, all_visits)
    return all_visits
def test_lookup_origin_single_slash_after_protocol(archive_data):
    """A URL with a single slash after the scheme should still resolve
    to the canonical origin URL."""
    canonical_url = "http://snapshot.debian.org/package/r-base/"
    archive_data.origin_add([Origin(url=canonical_url)])
    malformed_origin_url = "http:/snapshot.debian.org/package/r-base/"
    origin_info = archive.lookup_origin({"url": malformed_origin_url})
    assert origin_info["url"] == canonical_url
def test_lookup_origin_missing_trailing_slash(archive_data):
    """A URL missing its trailing slash should still resolve to the
    canonical origin URL."""
    deb_origin = Origin(url="http://snapshot.debian.org/package/r-base/")
    archive_data.origin_add([deb_origin])
    truncated_url = deb_origin.url[:-1]
    origin_info = archive.lookup_origin({"url": truncated_url})
    assert origin_info["url"] == deb_origin.url
def test_lookup_origin_extra_trailing_slash(origin):
    """A URL with an extra trailing slash should still resolve to the
    canonical origin URL."""
    queried_url = f"{origin['url']}/"
    origin_info = archive.lookup_origin({"url": queried_url})
    assert origin_info["url"] == origin["url"]
def test_lookup_origin(archive_data, new_origin):
    """Looking up an existing origin returns the stored origin data."""
    archive_data.origin_add([new_origin])
    expected_origin = archive_data.origin_get([new_origin.url])[0]
    actual_origin = archive.lookup_origin({"url": new_origin.url})
    assert actual_origin == expected_origin
def resolve_swhid(
    swhid: str, query_params: Optional[QueryParameters] = None
) -> ResolvedSWHID:
    """
    Try to resolve a SoftWare Heritage persistent IDentifier into an url for
    browsing the targeted object.

    Args:
        swhid: a SoftWare Heritage persistent IDentifier
        query_params: optional dict filled with
            query parameters to append to the browse url

    Returns:
        a dict with the following keys:

            * **swhid_parsed**: the parsed identifier
            * **browse_url**: the url for browsing the targeted object
    """
    swhid_parsed = get_swhid(swhid)
    object_type = swhid_parsed.object_type
    object_id = swhid_parsed.object_id
    browse_url = None
    url_args = {}
    query_dict = QueryDict("", mutable=True)
    fragment = ""
    # line-range fragments only make sense for content objects
    process_lines = object_type == ObjectType.CONTENT
    # carry over caller-supplied query parameters, in deterministic order
    if query_params and len(query_params) > 0:
        for k in sorted(query_params.keys()):
            query_dict[k] = query_params[k]
    if swhid_parsed.origin:
        # resolve the origin qualifier to its canonical archived URL
        origin_url = unquote(swhid_parsed.origin)
        origin_url = archive.lookup_origin({"url": origin_url})["url"]
        query_dict["origin_url"] = origin_url
    if swhid_parsed.path and swhid_parsed.path != b"/":
        query_dict["path"] = swhid_parsed.path.decode("utf8", errors="replace")
        if swhid_parsed.anchor:
            # resolve the root directory the path qualifier is relative to,
            # depending on the anchor object type
            directory = b""
            if swhid_parsed.anchor.object_type == ObjectType.DIRECTORY:
                directory = swhid_parsed.anchor.object_id
            elif swhid_parsed.anchor.object_type == ObjectType.REVISION:
                revision = archive.lookup_revision(
                    hash_to_hex(swhid_parsed.anchor.object_id)
                )
                directory = revision["directory"]
            elif swhid_parsed.anchor.object_type == ObjectType.RELEASE:
                release = archive.lookup_release(
                    hash_to_hex(swhid_parsed.anchor.object_id)
                )
                if release["target_type"] == REVISION:
                    revision = archive.lookup_revision(release["target"])
                    directory = revision["directory"]
            if object_type == ObjectType.CONTENT:
                if not swhid_parsed.origin:
                    # when no origin context, content objects need to have their
                    # path prefixed by root directory id for proper breadcrumbs display
                    query_dict["path"] = hash_to_hex(directory) + query_dict["path"]
                else:
                    # remove leading slash from SWHID content path
                    query_dict["path"] = query_dict["path"][1:]
            elif object_type == ObjectType.DIRECTORY:
                object_id = directory
                # remove leading and trailing slashes from SWHID directory path
                if query_dict["path"].endswith("/"):
                    query_dict["path"] = query_dict["path"][1:-1]
                else:
                    query_dict["path"] = query_dict["path"][1:]
    # snapshot context
    if swhid_parsed.visit:
        if swhid_parsed.visit.object_type != ObjectType.SNAPSHOT:
            raise BadInputExc("Visit must be a snapshot SWHID.")
        query_dict["snapshot"] = hash_to_hex(swhid_parsed.visit.object_id)
        if swhid_parsed.anchor:
            if swhid_parsed.anchor.object_type == ObjectType.REVISION:
                # check if the anchor revision is the tip of a branch
                branch_name = archive.lookup_snapshot_branch_name_from_tip_revision(
                    hash_to_hex(swhid_parsed.visit.object_id),
                    hash_to_hex(swhid_parsed.anchor.object_id),
                )
                if branch_name:
                    query_dict["branch"] = branch_name
                elif object_type != ObjectType.REVISION:
                    # not a branch tip: browse in the context of that revision
                    query_dict["revision"] = hash_to_hex(swhid_parsed.anchor.object_id)
            elif swhid_parsed.anchor.object_type == ObjectType.RELEASE:
                release = archive.lookup_release(
                    hash_to_hex(swhid_parsed.anchor.object_id)
                )
                if release:
                    query_dict["release"] = release["name"]
        if object_type == ObjectType.REVISION and "release" not in query_dict:
            # a bare revision SWHID: try to resolve it to a branch name too
            branch_name = archive.lookup_snapshot_branch_name_from_tip_revision(
                hash_to_hex(swhid_parsed.visit.object_id), hash_to_hex(object_id)
            )
            if branch_name:
                query_dict["branch"] = branch_name
    # browsing content or directory without snapshot context
    elif (
        object_type in (ObjectType.CONTENT, ObjectType.DIRECTORY)
        and swhid_parsed.anchor
    ):
        if swhid_parsed.anchor.object_type == ObjectType.REVISION:
            # anchor revision, objects are browsed from its view
            object_type = ObjectType.REVISION
            object_id = swhid_parsed.anchor.object_id
        elif (
            object_type == ObjectType.DIRECTORY
            and swhid_parsed.anchor.object_type == ObjectType.DIRECTORY
        ):
            # a directory is browsed from its root
            object_id = swhid_parsed.anchor.object_id
    # build the URL arguments expected by the browse view for each object type
    if object_type == ObjectType.CONTENT:
        url_args["query_string"] = f"sha1_git:{hash_to_hex(object_id)}"
    elif object_type in (ObjectType.DIRECTORY, ObjectType.RELEASE, ObjectType.REVISION):
        url_args["sha1_git"] = hash_to_hex(object_id)
    elif object_type == ObjectType.SNAPSHOT:
        url_args["snapshot_id"] = hash_to_hex(object_id)
    if swhid_parsed.lines and process_lines:
        # append a #Lx or #Lx-Ly fragment to highlight the selected lines
        lines = swhid_parsed.lines
        fragment += "#L" + str(lines[0])
        if lines[1]:
            fragment += "-L" + str(lines[1])
    if url_args:
        browse_url = (
            reverse(
                f"browse-{object_type.name.lower()}",
                url_args=url_args,
                query_params=query_dict,
            )
            + fragment
        )
    return ResolvedSWHID(swhid_parsed=swhid_parsed, browse_url=browse_url)
def _swh_badge(
    request: HttpRequest,
    object_type: str,
    object_id: str,
    object_swhid: Optional[str] = "",
) -> HttpResponse:
    """
    Generate a Software Heritage badge for a given object type and id.

    Args:
        request: input http request
        object_type: The type of swh object to generate a badge for,
            either *content*, *directory*, *revision*, *release*, *origin*
            or *snapshot*
        object_id: The id of the swh object, either an url for origin
            type or a *sha1* for other object types
        object_swhid: If provided, the object SWHID will not be recomputed

    Returns:
        HTTP response with content type *image/svg+xml* containing the SVG
        badge data. If the provided parameters are invalid, HTTP 400 status
        code will be returned. If the object can not be found in the archive,
        HTTP 404 status code will be returned.
    """
    left_text = "error"
    whole_link = None
    try:
        if object_type == "origin":
            # only check the origin exists; badge text is fixed for origins
            archive.lookup_origin({"url": object_id})
            right_text = "repository"
            whole_link = reverse(
                "browse-origin", query_params={"origin_url": object_id}
            )
        else:
            # when SWHID is provided, object type and id will be parsed
            # from it
            if object_swhid:
                parsed_swhid = QualifiedSWHID.from_string(object_swhid)
                object_type = parsed_swhid.object_type.name.lower()
                object_id = hash_to_hex(parsed_swhid.object_id)
                swh_object = archive.lookup_object(object_type, object_id)
                # remove SWHID qualified if any for badge text
                right_text = str(
                    CoreSWHID(
                        object_type=parsed_swhid.object_type,
                        object_id=parsed_swhid.object_id,
                    )
                )
            else:
                # compute the core SWHID from the raw object type and id
                right_text = str(
                    CoreSWHID(
                        object_type=ObjectType[object_type.upper()],
                        object_id=hash_to_bytes(object_id),
                    )
                )
                swh_object = archive.lookup_object(object_type, object_id)
            whole_link = resolve_swhid(str(right_text))["browse_url"]
            # use release name for badge text
            if object_type == RELEASE:
                right_text = "release %s" % swh_object["name"]
        # reached only when the lookups above succeeded
        left_text = "archived"
    except (BadInputExc, ValidationError):
        right_text = f'invalid {object_type if object_type else "object"} id'
        # switch to the error badge styling below
        object_type = "error"
    except NotFoundExc:
        right_text = f'{object_type if object_type else "object"} not found'
        object_type = "error"
    badge_data = badge(
        left_text=left_text,
        right_text=right_text,
        right_color=_badge_config[object_type]["color"],
        whole_link=request.build_absolute_uri(whole_link),
        whole_title=_badge_config[object_type]["title"],
        logo=_get_logo_data(),
        embed_logo=True,
    )
    return HttpResponse(badge_data, content_type="image/svg+xml")