Beispiel #1
0
def test_graph_ndjson_response(api_client, keycloak_mock, requests_mock):
    """Check that a chunked NDJSON graph response is streamed back unchanged."""
    _authenticate_graph_user(api_client, keycloak_mock)

    graph_query = "visit/paths/swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb"

    # payload served by the mocked graph service: one JSON array per line
    expected_body = textwrap.dedent("""\
        ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\
         "swh:1:cnt:acfb7cabd63b368a03a9df87670ece1488c8bce0"]
        ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\
         "swh:1:cnt:2a0837708151d76edf28fdbb90dc3eabc676cff3"]
        ["swh:1:dir:644dd466d8ad527ea3a609bfd588a3244e6dafcb",\
         "swh:1:cnt:eaf025ad54b94b2fdda26af75594cfae3491ec75"]
        """)

    mocked_headers = {
        "Content-Type": "application/x-ndjson",
        "Transfer-Encoding": "chunked",
    }
    requests_mock.get(
        get_config()["graph"]["server_url"] + graph_query,
        text=expected_body,
        headers=mocked_headers,
    )

    api_url = reverse("api-1-graph", url_args={"graph_query": graph_query})
    resp = check_http_get_response(api_client, api_url, status_code=200)

    # chunked responses must be relayed as a stream, with the same content
    assert isinstance(resp, StreamingHttpResponse)
    assert resp["Content-Type"] == "application/x-ndjson"
    assert b"".join(resp.streaming_content) == expected_body.encode()
Beispiel #2
0
    def test_recaptcha_not_activated_server_side(
            self, mock_create_save_origin_request):
        """Post a save request with reCAPTCHA disabled and check the
        JSON response equals the data returned by the mocked service."""
        swh_web_config = get_config()
        # disable reCAPTCHA in the configuration
        recaptcha_settings = {'activated': False, 'site_key': ''}
        swh_web_config.update({'grecaptcha': recaptcha_settings})

        origin_type = 'git'
        origin_url = 'https://github.com/python/cpython'
        url_args = {'origin_type': origin_type, 'origin_url': origin_url}

        expected_data = {
            'origin_type': origin_type,
            'origin_url': origin_url,
            'save_request_date': datetime.now().isoformat(),
            'save_request_status': SAVE_REQUEST_ACCEPTED,
            'save_task_status': SAVE_TASK_NOT_YET_SCHEDULED,
            'visit_date': None
        }
        mock_create_save_origin_request.return_value = expected_data

        resp = self.client.post(
            reverse('browse-origin-save-request', url_args=url_args),
            data={},
            content_type='application/json')

        received_data = json.loads(resp.content.decode('utf-8'))
        self.assertEqual(received_data, expected_data)
Beispiel #3
0
def api_graph_proxy(
    request: Request, graph_query: str
) -> Union[Response, StreamingHttpResponse]:
    """Forward a graph query to the configured graph server and relay the answer.

    Requests whose host differs from ``SWH_WEB_INTERNAL_SERVER_NAME`` must come
    from an authenticated user holding the ``API_GRAPH_PERM`` permission.

    Args:
        request: the incoming request; its GET parameters are forwarded
        graph_query: path appended to the graph server URL

    Returns:
        A 401 or 403 response when access is refused, a plain response when
        the graph server answer is not chunked, a streaming response otherwise.
    """
    if request.get_host() != SWH_WEB_INTERNAL_SERVER_NAME:
        user = request.user
        if not (user and user.is_authenticated):
            return Response("Authentication credentials were not provided.", status=401)
        if not user.has_perm(API_GRAPH_PERM):
            return Response(
                "You do not have permission to perform this action.", status=403
            )

    target_url = get_config()["graph"]["server_url"] + graph_query
    if request.GET:
        target_url = target_url + "?" + request.GET.urlencode(safe="/;:")

    upstream = requests.get(target_url, stream=True)

    # graph stats and counter endpoint responses are not streamed
    if upstream.headers.get("Transfer-Encoding") != "chunked":
        return Response(
            upstream.text,
            status=upstream.status_code,
            content_type=upstream.headers["Content-Type"],
        )

    # other endpoint responses are streamed
    resolve_origins = strtobool(request.GET.get("resolve_origins", "false"))
    if upstream.status_code == 200 and resolve_origins:
        body_stream = _resolve_origin_swhids_in_graph_response(upstream)
    else:
        # iter_lines strips line terminators, put them back
        body_stream = (line + b"\n" for line in upstream.iter_lines())
    return StreamingHttpResponse(
        body_stream,
        status=upstream.status_code,
        content_type=upstream.headers["Content-Type"],
    )
Beispiel #4
0
def error_response(request, error, doc_data):
    """Build the API response describing an error.

    The HTTP status is 404 for NotFoundExc, 403 for ForbiddenExc, 503 for
    StorageDBError or StorageAPIError, and 400 for anything else.

    Args:
        request: a DRF Request object
        error: the exception that caused the error
        doc_data: documentation data for HTML response

    """
    if isinstance(error, NotFoundExc):
        status = 404
    elif isinstance(error, ForbiddenExc):
        status = 403
    elif isinstance(error, (StorageDBError, StorageAPIError)):
        status = 503
    else:
        status = 400

    payload = {
        'exception': error.__class__.__name__,
        'reason': str(error),
    }
    # the traceback is only exposed when the debug setting is on
    if get_config()['debug']:
        payload['traceback'] = traceback.format_exc()

    return make_api_response(request, payload, doc_data,
                             options={'status': status})
Beispiel #5
0
def search_origin_metadata(fulltext: str,
                           limit: int = 50) -> Iterable[OriginMetadataInfo]:
    """Look up origins whose metadata match a string pattern.

    The lookup goes through swh-search when it is available and selected
    as metadata search backend, through the indexer storage fulltext
    search otherwise.

    Args:
        fulltext: the string pattern to search for in origin metadata
        limit: the maximum number of found origins to return

    Returns:
        Iterable of origin metadata information for existing origins

    """
    use_swh_search = (
        search
        and config.get_config()["metadata_search_backend"] == "swh-search")

    if use_swh_search:
        found = search.origin_search(
            metadata_pattern=fulltext,
            limit=limit,
        )
        origin_urls = [result["url"] for result in found.results]
        raw_matches = idx_storage.origin_intrinsic_metadata_get(origin_urls)
    else:
        raw_matches = idx_storage.origin_intrinsic_metadata_search_fulltext(
            conjunction=[fulltext], limit=limit)

    match_dicts = [raw_match.to_dict() for raw_match in raw_matches]
    origin_ids = [match_dict["id"] for match_dict in match_dicts]
    origins = storage.origin_get(origin_ids)

    results = []
    for origin, match_dict in zip(origins, match_dicts):
        # only keep matches whose origin is known to the storage
        if origin:
            match_dict["from_revision"] = hashutil.hash_to_hex(
                match_dict["from_revision"])
            del match_dict["id"]
            results.append(
                OriginMetadataInfo(url=origin.url, metadata=match_dict))

    return results
Beispiel #6
0
def _generate_and_test_bearer_token(client, kc_oidc_mock):
    """Run the bearer token generation flow and check its outcome.

    Args:
        client: test client used to log in and send the requests
        kc_oidc_mock: mocked Keycloak OpenID Connect client

    Returns:
        The decrypted offline token that the flow saved to the database
    """
    # user authenticates
    client.login(code="code",
                 code_verifier="code-verifier",
                 redirect_uri="redirect-uri")
    # user initiates bearer token generation flow
    url = reverse("oidc-generate-bearer-token")
    response = check_http_get_response(client, url, status_code=302)
    request = response.wsgi_request
    redirect_uri = reverse("oidc-generate-bearer-token-complete",
                           request=request)
    # check login data and redirection to Keycloak is valid
    login_data = _check_oidc_login_code_flow_data(
        request,
        response,
        kc_oidc_mock,
        redirect_uri=redirect_uri,
        scope="openid offline_access",
    )

    # once a user has identified himself in Keycloak, he is
    # redirected to the 'oidc-generate-bearer-token-complete' view
    # to get and save bearer token

    # generate authorization code / session state in the same
    # manner as Keycloak
    code = f"{uuid.uuid4()}.{uuid.uuid4()}.{uuid.uuid4()}"
    session_state = str(uuid.uuid4())

    token_complete_url = reverse(
        "oidc-generate-bearer-token-complete",
        query_params={
            "code": code,
            "state": login_data["state"],
            "session_state": session_state,
        },
    )

    # count rows at the database level instead of loading every token
    nb_tokens = OIDCUserOfflineTokens.objects.count()
    response = check_html_get_response(client,
                                       token_complete_url,
                                       status_code=302)
    request = response.wsgi_request

    # check token has been generated and saved encrypted to database
    assert OIDCUserOfflineTokens.objects.count() == nb_tokens + 1
    encrypted_token = OIDCUserOfflineTokens.objects.last().offline_token
    secret = get_config()["secret_key"].encode()
    salt = request.user.sub.encode()
    decrypted_token = decrypt_data(encrypted_token, secret, salt)
    oidc_profile = kc_oidc_mock.authorization_code(code=code,
                                                   redirect_uri=redirect_uri)
    assert decrypted_token.decode("ascii") == oidc_profile["refresh_token"]

    # should redirect to tokens management Web UI
    assert response["location"] == reverse("oidc-profile") + "#tokens"

    return decrypted_token
Beispiel #7
0
def test_layout_without_oidc_auth_enabled(client, mocker):
    """Render the homepage with an empty Keycloak server URL and check
    it contains the URL of the "login" view."""
    # work on a copy so the shared configuration is left untouched
    patched_config = deepcopy(get_config())
    patched_config["keycloak"]["server_url"] = ""
    mocker.patch("swh.web.common.utils.get_config",
                 return_value=patched_config)

    homepage_url = reverse("swh-web-homepage")
    resp = check_http_get_response(client, homepage_url, status_code=200)
    assert_contains(resp, reverse("login"))
Beispiel #8
0
def _swh_coverage(request):
    """Render the coverage page with the code providers and the
    ``coverage_count_origins`` configuration value."""
    count_origins = get_config()["coverage_count_origins"]
    template_context = {
        "providers": _code_providers,
        "count_origins": count_origins,
    }
    return render(request, "misc/coverage.html", template_context)
Beispiel #9
0
def context_processor(request):
    """
    Django context processor exposing the object icons and the
    reCAPTCHA settings to all swh-web templates.
    """
    grecaptcha = get_config()['grecaptcha']
    template_vars = {'swh_object_icons': swh_object_icons}
    template_vars['grecaptcha_activated'] = grecaptcha['activated']
    template_vars['grecaptcha_site_key'] = grecaptcha['site_key']
    return template_vars
Beispiel #10
0
 def get_exempted_networks(self, scope_name):
     """Return the networks exempted from throttling for a scope.

     The networks are read from the 'throttling' configuration the first
     time they are needed, then kept on the instance and reused.
     """
     if self.exempted_networks:
         # already computed by a previous call
         return self.exempted_networks

     scope_config = get_config()['throttling']['scopes'].get(scope_name)
     if scope_config:
         declared = scope_config.get('exempted_networks')
         if declared:
             self.exempted_networks = [
                 ipaddress.ip_network(cidr) for cidr in declared
             ]
     return self.exempted_networks
Beispiel #11
0
def handle_view_exception(request, exc):
    """
    Report an exception raised inside a swh-web browse view to Sentry
    and generate the matching error page.

    The status code is 400 for BadInputExc, 403 for ForbiddenExc,
    404 for NotFoundExc and 500 otherwise. When the debug setting is on,
    the traceback replaces the short description and is also attached
    to the returned response.
    """
    sentry_sdk.capture_exception(exc)

    error_description = f"{type(exc).__name__}: {exc}"
    if get_config()["debug"]:
        error_description = traceback.format_exc()
        logger.debug(error_description)

    # tested from the highest priority down: an exception matching several
    # classes gets the status of the first match in this chain
    if isinstance(exc, NotFoundExc):
        error_code = 404
    elif isinstance(exc, ForbiddenExc):
        error_code = 403
    elif isinstance(exc, BadInputExc):
        error_code = 400
    else:
        error_code = 500

    error_page = _generate_error_page(request, error_code, error_description)
    if get_config()["debug"]:
        error_page.traceback = error_description
    return error_page
Beispiel #12
0
def test_graph_endpoint_no_authentication_for_vpn_users(
        api_client, requests_mock):
    """A request sent with the internal server name gets a 200 without
    any credentials being set on the client."""
    graph_query = "stats"
    graph_server_url = get_config()["graph"]["server_url"]
    requests_mock.get(
        graph_server_url + graph_query,
        json={},
        headers={"Content-Type": "application/json"},
    )

    api_url = reverse("api-1-graph", url_args={"graph_query": graph_query})
    check_http_get_response(
        api_client,
        api_url,
        status_code=200,
        server_name=SWH_WEB_INTERNAL_SERVER_NAME,
    )
Beispiel #13
0
def oidc_get_bearer_token(request: HttpRequest) -> HttpResponse:
    """Return, as plain text, the decrypted offline token whose identifier
    is given in the JSON request body under the ``token_id`` key.

    Answers 403 when the user is not an authenticated OIDCUser and 401
    when the stored token cannot be decrypted.
    """
    if not (request.user.is_authenticated
            and isinstance(request.user, OIDCUser)):
        return HttpResponseForbidden()
    try:
        payload = json.loads(request.body.decode("ascii"))
        oidc_user = cast(OIDCUser, request.user)
        stored_token = OIDCUserOfflineTokens.objects.get(
            id=payload["token_id"])
        # the token is decrypted with the secret key, salted by the user sub
        offline_token = decrypt_data(
            stored_token.offline_token,
            get_config()["secret_key"].encode(),
            oidc_user.sub.encode(),
        )
        return HttpResponse(offline_token.decode("ascii"),
                            content_type="text/plain")
    except InvalidToken:
        return HttpResponse(status=401)
Beispiel #14
0
def override_storages(storage, idx_storage, search):
    """
    Swap the storage, indexer storage and search backends, both in the
    swh-web configuration and on the ``archive`` module.
    """
    config.get_config().update(
        dict(storage=storage, indexer_storage=idx_storage, search=search))

    replacements = (
        ("storage", storage),
        ("idx_storage", idx_storage),
        ("search", search),
    )
    for attr_name, backend in replacements:
        setattr(archive, attr_name, backend)
Beispiel #15
0
def test_graph_json_response(api_client, keycloak_mock, requests_mock):
    """Check that a JSON answer from the graph service is relayed as is."""
    _authenticate_graph_user(api_client, keycloak_mock)

    graph_query = "stats"
    mocked_url = get_config()["graph"]["server_url"] + graph_query
    requests_mock.get(
        mocked_url,
        json=_response_json,
        headers={"Content-Type": "application/json"},
    )

    api_url = reverse("api-1-graph", url_args={"graph_query": graph_query})
    resp = check_http_get_response(api_client, api_url, status_code=200)

    expected_content = json.dumps(_response_json).encode()
    assert resp.content_type == "application/json"
    assert resp.content == expected_content
Beispiel #16
0
def test_graph_endpoint_needs_permission(api_client, keycloak_mock,
                                         requests_mock):
    """A user merely logged in gets a 403 from the graph endpoint; the
    request succeeds once ``_authenticate_graph_user`` has been applied."""
    graph_query = "stats"
    api_url = reverse("api-1-graph", url_args={"graph_query": graph_query})

    # first attempt: bearer token of a plain Keycloak login
    oidc_profile = keycloak_mock.login()
    bearer = f"Bearer {oidc_profile['refresh_token']}"
    api_client.credentials(HTTP_AUTHORIZATION=bearer)
    check_http_get_response(api_client, api_url, status_code=403)

    # second attempt: authenticated as a graph user
    _authenticate_graph_user(api_client, keycloak_mock)
    requests_mock.get(
        get_config()["graph"]["server_url"] + graph_query,
        json={},
        headers={"Content-Type": "application/json"},
    )
    check_http_get_response(api_client, api_url, status_code=200)
Beispiel #17
0
def get_oidc_client(client_id: str = OIDC_SWH_WEB_CLIENT_ID) -> KeycloakOpenIDConnect:
    """
    Return the KeycloakOpenIDConnect instance associated to a client,
    creating and memoizing it on first use.

    Args:
        client_id: client identifier in the SoftwareHeritage realm

    Returns:
        An object to ease the interaction with the Keycloak server
    """
    kc_settings = get_config()["keycloak"]

    if client_id in _keycloak_oidc:
        return _keycloak_oidc[client_id]

    # first request for this client: build the instance from the
    # configured server URL and realm name, then keep it for later calls
    _keycloak_oidc[client_id] = KeycloakOpenIDConnect(
        kc_settings["server_url"], kc_settings["realm_name"], client_id
    )
    return _keycloak_oidc[client_id]
Beispiel #18
0
def oidc_revoke_bearer_tokens(request: HttpRequest) -> HttpResponse:
    """Revoke the offline tokens whose identifiers are listed in the JSON
    request body under the ``token_ids`` key.

    Each token is decrypted, passed to the OIDC client ``logout`` method
    and then deleted from the database. Answers 403 when the user is not an
    authenticated OIDCUser and 401 when a token cannot be decrypted.
    """
    if not (request.user.is_authenticated
            and isinstance(request.user, OIDCUser)):
        return HttpResponseForbidden()
    try:
        payload = json.loads(request.body.decode("ascii"))
        oidc_user = cast(OIDCUser, request.user)
        for token_id in payload["token_ids"]:
            stored_token = OIDCUserOfflineTokens.objects.get(id=token_id)
            offline_token = decrypt_data(
                stored_token.offline_token,
                get_config()["secret_key"].encode(),
                oidc_user.sub.encode(),
            )
            get_oidc_client().logout(offline_token.decode("ascii"))
            stored_token.delete()
        return HttpResponse(status=200)
    except InvalidToken:
        return HttpResponse(status=401)
Beispiel #19
0
    def setUpClass(cls):
        """Expose the tests data on the class and plug the in-memory
        storages into the swh-web configuration and the service module."""
        super().setUpClass()
        tests_data = get_tests_data()
        exposed_entries = (
            'storage',
            'idx_storage',
            'mimetype_indexer',
            'language_indexer',
            'license_indexer',
            'ctags_indexer',
        )
        for entry in exposed_entries:
            setattr(cls, entry, tests_data[entry])

        swh_config = config.get_config()

        # Update swh-web configuration to use the in-memory storage
        # instantiated in the tests.data module
        swh_config.update({'storage': cls.storage})
        service.storage = cls.storage

        # Update swh-web configuration to use the in-memory indexer storage
        # instantiated in the tests.data modules
        swh_config.update({'indexer_storage': cls.idx_storage})
        service.idx_storage = cls.idx_storage
Beispiel #20
0
def _stat_counters(request):
    """Return, as JSON, the archive counters and their history.

    The history is fetched from the ``history_counters_url`` setting and
    completed with the backfill data. Any failure while doing so is
    reported to Sentry and does not prevent the response from being sent.
    """
    stat_counters = archive.stat_counters()
    history_url = get_config()["history_counters_url"]
    stat_counters_history = {}
    try:
        stat_counters_history = json.loads(
            requests.get(history_url, timeout=5).text)
        for iso_date, counts_by_type in _stat_counters_backfill.items():
            # convert date to javascript timestamp (in ms)
            js_timestamp = (
                int(parse_iso8601_date_to_utc(iso_date).timestamp()) * 1000)
            for object_type, object_count in counts_by_type.items():
                history_points = stat_counters_history[object_type]
                history_points.append([js_timestamp, object_count])
    except Exception as exc:
        sentry_sdk.capture_exception(exc)

    return JsonResponse({
        "stat_counters": stat_counters,
        "stat_counters_history": stat_counters_history,
    })
Beispiel #21
0
def handle_view_exception(request, exc, html_response=True):
    """
    Turn an exception raised inside a swh-web browse view into an
    error response.

    The status code is 400 for BadInputExc, 403 for ForbiddenExc,
    404 for NotFoundExc and 500 otherwise. The response is an HTML error
    page, or a plain text one when ``html_response`` is false.
    """
    error_description = f'{type(exc).__name__}: {exc}'
    if get_config()['debug']:
        # in debug mode the full traceback replaces the short description
        error_description = traceback.format_exc()

    # tested from the highest priority down: an exception matching several
    # classes gets the status of the first match in this chain
    if isinstance(exc, NotFoundExc):
        error_code = 404
    elif isinstance(exc, ForbiddenExc):
        error_code = 403
    elif isinstance(exc, BadInputExc):
        error_code = 400
    else:
        error_code = 500

    if not html_response:
        return HttpResponse(error_description,
                            content_type='text/plain',
                            status=error_code)
    return _generate_error_page(request, error_code, error_description)
Beispiel #22
0
def test_graph_text_plain_response(api_client, keycloak_mock, requests_mock):
    """Check that a chunked plain text graph response is streamed back
    unchanged."""
    _authenticate_graph_user(api_client, keycloak_mock)

    graph_query = "leaves/swh:1:dir:432d1b21c1256f7408a07c577b6974bbdbcc1323"

    # payload served by the mocked graph service: one SWHID per line
    expected_body = textwrap.dedent("""\
        swh:1:cnt:1d3dace0a825b0535c37c53ed669ef817e9c1b47
        swh:1:cnt:6d5b280f4e33589ae967a7912a587dd5cb8dedaa
        swh:1:cnt:91bef238bf01356a550d416d14bb464c576ac6f4
        swh:1:cnt:58a8b925a463b87d49639fda282b8f836546e396
        swh:1:cnt:fd32ee0a87e16ccc853dfbeb7018674f9ce008c0
        swh:1:cnt:ab7c39871872589a4fc9e249ebc927fb1042c90d
        swh:1:cnt:93073c02bf3869845977527de16af4d54765838d
        swh:1:cnt:4251f795b52c54c447a97c9fe904d8b1f993b1e0
        swh:1:cnt:c6e7055424332006d07876ffeba684e7e284b383
        swh:1:cnt:8459d8867dc3b15ef7ae9683e21cccc9ab2ec887
        swh:1:cnt:5f9981d52202815aa947f85b9dfa191b66f51138
        swh:1:cnt:00a685ec51bcdf398c15d588ecdedb611dbbab4b
        swh:1:cnt:e1cf1ea335106a0197a2f92f7804046425a7d3eb
        swh:1:cnt:07069b38087f88ec192d2c9aff75a502476fd17d
        swh:1:cnt:f045ee845c7f14d903a2c035b2691a7c400c01f0
        """)

    mocked_headers = {
        "Content-Type": "text/plain",
        "Transfer-Encoding": "chunked"
    }
    requests_mock.get(
        get_config()["graph"]["server_url"] + graph_query,
        text=expected_body,
        headers=mocked_headers,
    )

    api_url = reverse("api-1-graph", url_args={"graph_query": graph_query})
    resp = check_http_get_response(
        api_client,
        api_url,
        status_code=200,
        content_type="text/plain",
    )

    # chunked responses must be relayed as a stream, with the same content
    assert isinstance(resp, StreamingHttpResponse)
    assert b"".join(resp.streaming_content) == expected_body.encode()
Beispiel #23
0
    def test_recaptcha_activation_in_gui(self, origin):
        """Check that the reCAPTCHA script and DOM element are present in
        the save and origin directory pages only when reCAPTCHA is
        activated in the configuration."""
        captcha_script_url = 'https://www.google.com/recaptcha/api.js'
        captcha_dom_elt = '<div class="g-recaptcha"'

        # views to request, with the keyword arguments given to reverse
        checked_views = (
            ('browse-origin-save', {}),
            ('browse-origin-directory', {
                'url_args': {
                    'origin_type': origin['type'],
                    'origin_url': origin['url']
                }
            }),
        )

        swh_web_config = get_config()

        for captcha_activated in (True, False):

            swh_web_config.update({
                'grecaptcha': {
                    'activated': captcha_activated,
                    'site_key': ''
                }
            })

            # presence is expected when activated, absence otherwise
            if captcha_activated:
                check_markup = self.assertContains
            else:
                check_markup = self.assertNotContains

            for view_name, reverse_kwargs in checked_views:
                url = reverse(view_name, **reverse_kwargs)
                resp = self.client.get(url)
                check_markup(resp, captcha_script_url)
                check_markup(resp, captcha_dom_elt)
Beispiel #24
0
def context_processor(request):
    """
    Django context processor building the variables injected
    in all swh-web templates.
    """
    config = get_config()

    if hasattr(request, "user"):
        user = request.user
        if user.is_authenticated and not hasattr(user, "backend"):
            # To avoid django.template.base.VariableDoesNotExist errors
            # when rendering templates when standard Django user is logged in.
            user.backend = "django.contrib.auth.backends.ModelBackend"

    site_base_url = request.build_absolute_uri("/")

    return {
        "swh_object_icons": swh_object_icons,
        "available_languages": None,
        "swh_client_config": config["client_config"],
        "oidc_enabled": bool(config["keycloak"]["server_url"]),
        "browsers_supported_image_mimes": browsers_supported_image_mimes,
        "keycloak": config["keycloak"],
        "site_base_url": site_base_url,
        "DJANGO_SETTINGS_MODULE": os.environ["DJANGO_SETTINGS_MODULE"],
        "status": config["status"],
        # true when the base URL contains one of the staging server names
        "swh_web_staging": any(
            server_name in site_base_url
            for server_name in config["staging_server_names"]
        ),
        "visit_types": ORIGIN_VISIT_TYPES,
    }
Beispiel #25
0
def oidc_generate_bearer_token_complete(request: HttpRequest) -> HttpResponse:
    """Complete the bearer token generation flow.

    The authorization code found in the request is exchanged for an OIDC
    profile, whose refresh token is encrypted and stored for the user.

    Args:
        request: the request whose GET parameters carry the
            authorization ``code`` (or an ``error``)

    Returns:
        A redirection to the tokens section of the "oidc-profile" view

    Raises:
        ForbiddenExc: if the user is not an authenticated OIDCUser
        Exception: if the request has an ``error`` GET parameter
    """
    if not request.user.is_authenticated or not isinstance(
            request.user, OIDCUser):
        raise ForbiddenExc("You are not allowed to generate bearer tokens.")
    if "error" in request.GET:
        raise Exception(request.GET["error"])

    oidc_client = get_oidc_client()
    login_data = _get_login_data(request)
    _check_login_data(request, login_data)
    oidc_profile = oidc_client.authorization_code(
        code=request.GET["code"],
        code_verifier=login_data["code_verifier"],
        redirect_uri=login_data["redirect_uri"],
    )
    user = cast(OIDCUser, request.user)
    token = oidc_profile["refresh_token"]
    # the token is encrypted with the secret key, salted by the user sub
    secret = get_config()["secret_key"].encode()
    salt = user.sub.encode()
    encrypted_token = encrypt_data(token.encode(), secret, salt)
    # create() already persists the new row, no extra save() is needed
    OIDCUserOfflineTokens.objects.create(user_id=str(user.id),
                                         offline_token=encrypted_token)
    return HttpResponseRedirect(reverse("oidc-profile") + "#tokens")
Beispiel #26
0
def test_oidc_profile_view(client, keycloak_mock):
    """
    The profile page requested by a logged in user should contain the
    link to the Keycloak account UI, the user details and permissions.
    """
    kc_config = get_config()["keycloak"]
    user_permissions = ["perm1", "perm2"]
    keycloak_mock.user_permissions = user_permissions

    client.login(code="", code_verifier="", redirect_uri="")
    resp = check_html_get_response(
        client,
        reverse("oidc-profile"),
        status_code=200,
        template_used="auth/profile.html",
    )

    user = resp.wsgi_request.user
    kc_account_url = (
        f"{kc_config['server_url']}realms/{kc_config['realm_name']}/account/")
    expected_fragments = [
        kc_account_url,
        user.username,
        user.first_name,
        user.last_name,
        user.email,
        *user_permissions,
    ]
    for fragment in expected_fragments:
        assert_contains(resp, fragment)
Beispiel #27
0
def is_recaptcha_valid(request, recaptcha_response):
    """
    Tell whether a Google reCAPTCHA response is valid.

    When reCAPTCHA is deactivated in the configuration, the response is
    always considered valid and no validation request is sent.

    Args:
        request (django.http.HttpRequest): the incoming HTTP request
        recaptcha_response (str): the reCAPTCHA response

    Returns:
        bool: Whether the reCAPTCHA response is valid or not
    """
    grecaptcha = get_config()['grecaptcha']
    if grecaptcha['activated'] is False:
        return True

    validation_payload = {
        'secret': grecaptcha['private_key'],
        'response': recaptcha_response,
        'remoteip': get_client_ip(request)
    }
    validation = requests.post(grecaptcha['validation_url'],
                               data=validation_payload,
                               verify=True)
    # a missing "success" field is treated as a failed validation
    return validation.json().get("success", False)
Beispiel #28
0
import requests
from requests.auth import HTTPBasicAuth
import sentry_sdk

from django.conf import settings
from django.contrib.admin.views.decorators import staff_member_required
from django.core.cache import cache
from django.core.paginator import Paginator
from django.http import JsonResponse
from django.shortcuts import render

from swh.web.admin.adminurls import admin_route
from swh.web.config import get_config

# "deposit" section of the swh-web configuration, read once when this
# module is imported
config = get_config()["deposit"]


@admin_route(r"deposit/", view_name="admin-deposit")
@staff_member_required(view_func=None, login_url=settings.LOGIN_URL)
def _admin_origin_save(request):
    """Render the deposit administration page."""
    template_name = "admin/deposit.html"
    return render(request, template_name)


@admin_route(r"deposit/list/", view_name="admin-deposit-list")
@staff_member_required(view_func=None, login_url=settings.LOGIN_URL)
def _admin_deposit_list(request):
    table_data = {}
    table_data["draw"] = int(request.GET["draw"])
    deposits_list_url = config["private_api_url"] + "deposits"
    deposits_list_auth = HTTPBasicAuth(
Beispiel #29
0
    Returns:
        A tuple (mimetype, encoding), for instance ('text/plain', 'us-ascii'),
        associated to the provided content.

    """
    m = magic.Magic(mime=True, mime_encoding=True)
    mime_encoding = m.from_buffer(content)
    mime_type, encoding = mime_encoding.split(";")
    encoding = encoding.replace(" charset=", "")
    return mime_type, encoding


# maximum authorized content size in bytes for HTML display
# with code highlighting (value read from the swh-web configuration
# once, when this module is imported)
content_display_max_size = get_config()["content_display_max_size"]


def _re_encode_content(mimetype, encoding, content_data):
    # encode textual content to utf-8 if needed
    if mimetype.startswith("text/"):
        # probably a malformed UTF-8 content, re-encode it
        # by replacing invalid chars with a substitution one
        if encoding == "unknown-8bit":
            content_data = content_data.decode("utf-8", "replace").encode("utf-8")
        elif encoding not in ["utf-8", "binary"]:
            content_data = content_data.decode(encoding, "replace").encode("utf-8")
    elif mimetype.startswith("application/octet-stream"):
        # file may detect a text content as binary
        # so try to decode it for display
        encodings = ["us-ascii", "utf-8"]
Beispiel #30
0
def _hide_traceback_in_message(message: str) -> str:
    """Drop the traceback part of a log message, keeping its last line.

    Lines preceding the first one starting with "Traceback" are kept,
    followed by the very last line of the message. A message without
    any traceback is returned unchanged.
    """
    lines = message.split("\n")
    for index, line in enumerate(lines):
        if line.startswith("Traceback"):
            kept = "".join(f"{kept_line}\n" for kept_line in lines[:index])
            return kept + lines[-1]
    # no traceback to hide: returning the message as is avoids
    # duplicating its last line
    return message


def get_save_origin_task_info(save_request_id: int,
                              full_info: bool = True) -> Dict[str, Any]:
    """
    Get detailed information about an accepted save origin request
    and its associated loading task.

    If the save request does not exist, or if the associated loading task
    info is archived and removed from the scheduler database, returns an
    empty dictionary.

    Args:
        save_request_id: identifier of a save origin request
        full_info: whether to return detailed info for staff users

    Returns:
        A dictionary with the following keys:

            - **type**: loading task type
            - **arguments**: loading task arguments
            - **id**: loading task database identifier
            - **backend_id**: loading task celery identifier
            - **scheduled**: loading task scheduling date
            - **ended**: loading task termination date
            - **status**: loading task execution status

        Depending on the availability of the task logs in the elasticsearch
        cluster of Software Heritage, the returned dictionary may also
        contain the following keys:

            - **name**: associated celery task name
            - **message**: relevant log message from task execution
            - **duration**: task execution time (only if it succeeded)
            - **worker**: name of the worker that executed the task
    """
    try:
        save_request = SaveOriginRequest.objects.get(id=save_request_id)
    except ObjectDoesNotExist:
        return {}

    task = scheduler.get_tasks([save_request.loading_task_id])
    task = task[0] if task else None
    if task is None:
        return {}

    task_run = scheduler.get_task_runs([task["id"]])
    task_run = task_run[0] if task_run else None
    if task_run is None:
        return {}
    # merge task info into the task run and expose the task id as "id"
    task_run["type"] = task["type"]
    task_run["arguments"] = task["arguments"]
    task_run["id"] = task_run["task"]
    del task_run["task"]
    del task_run["metadata"]

    es_workers_index_url = config.get_config()["es_workers_index_url"]
    if not es_workers_index_url:
        # NOTE(review): this early return skips the removal of staff only
        # fields done below when full_info is false - confirm it is intended
        return task_run
    es_workers_index_url += "/_search"

    # time window in which the task logs are searched
    if save_request.visit_date:
        min_ts = save_request.visit_date
        max_ts = min_ts + timedelta(days=7)
    else:
        min_ts = save_request.request_date
        max_ts = min_ts + timedelta(days=30)
    min_ts_unix = int(min_ts.timestamp()) * 1000
    max_ts_unix = int(max_ts.timestamp()) * 1000

    save_task_status = _save_task_status[task["status"]]
    priority = "3" if save_task_status == SAVE_TASK_FAILED else "6"

    query = {
        "bool": {
            "must": [
                {
                    "match_phrase": {
                        "priority": {
                            "query": priority
                        }
                    }
                },
                {
                    "match_phrase": {
                        "swh_task_id": {
                            "query": task_run["backend_id"]
                        }
                    }
                },
                {
                    "range": {
                        "@timestamp": {
                            "gte": min_ts_unix,
                            "lte": max_ts_unix,
                            "format": "epoch_millis",
                        }
                    }
                },
            ]
        }
    }

    try:
        response = requests.post(
            es_workers_index_url,
            json={
                "query": query,
                "sort": ["@timestamp"]
            },
            timeout=30,
        )
        results = json.loads(response.text)
        if results["hits"]["total"]["value"] >= 1:
            # hits are sorted by timestamp, use the last one
            task_run_info = results["hits"]["hits"][-1]["_source"]
            if "swh_logging_args_runtime" in task_run_info:
                duration = task_run_info["swh_logging_args_runtime"]
                task_run["duration"] = duration
            if "message" in task_run_info:
                task_run["message"] = task_run_info["message"]
            if "swh_logging_args_name" in task_run_info:
                task_run["name"] = task_run_info["swh_logging_args_name"]
            elif "swh_task_name" in task_run_info:
                task_run["name"] = task_run_info["swh_task_name"]
            if "hostname" in task_run_info:
                task_run["worker"] = task_run_info["hostname"]
            elif "host" in task_run_info:
                task_run["worker"] = task_run_info["host"]
    except Exception as exc:
        # best effort: the info gathered from the scheduler is still returned
        logger.warning("Request to Elasticsearch failed\n%s", exc)
        sentry_sdk.capture_exception(exc)

    if not full_info:
        for field in ("id", "backend_id", "worker"):
            # remove some staff only fields
            task_run.pop(field, None)
        if "message" in task_run and "Loading failure" in task_run["message"]:
            # hide traceback for non staff users, only display exception
            task_run["message"] = _hide_traceback_in_message(
                task_run["message"])

    return task_run