Esempio n. 1
0
def content_raw(request, query_string):
    """Django view serving the raw bytes of a content identified by its hash.

    The url that points to it is
    :http:get:`/browse/content/[(algo_hash):](hash)/raw/`

    Two GET parameters are honoured:

    * ``re_encode``: boolean-like string (default ``"false"``) forwarded
      to ``request_content``
    * ``filename``: name advertised in the ``Content-disposition`` header;
      when missing or empty it defaults to ``<algo>_<hex checksum>``

    Contents whose mimetype starts with ``text/`` (or is ``inode/x-empty``)
    are served as ``text/plain``; everything else is served as an
    ``application/octet-stream`` attachment.
    """
    params = request.GET
    wants_re_encode = bool(strtobool(params.get("re_encode", "false")))
    algo, raw_checksum = query.parse_hash(query_string)
    hex_checksum = hash_to_hex(raw_checksum)
    content_data = request_content(
        query_string, max_size=None, re_encode=wants_re_encode)

    # Fall back on a name derived from the hash when none was requested.
    filename = params.get("filename", None) or "%s_%s" % (algo, hex_checksum)

    mimetype = content_data["mimetype"]
    is_textual = mimetype.startswith("text/") or mimetype == "inode/x-empty"
    if is_textual:
        content_type = "text/plain"
        disposition = "filename=%s" % filename
    else:
        content_type = "application/octet-stream"
        disposition = "attachment; filename=%s" % filename

    response = HttpResponse(content_data["raw_data"],
                            content_type=content_type)
    response["Content-disposition"] = disposition
    return response
Esempio n. 2
0
def content_raw(request, query_string):
    """Django view serving the raw bytes of a content identified by its hash.

    The url that points to it is
    :http:get:`/browse/content/[(algo_hash):](hash)/raw/`

    Two GET parameters are honoured:

    * ``reencode``: boolean-like string (default ``'false'``) forwarded
      to ``request_content``
    * ``filename``: name advertised in the ``Content-disposition`` header;
      when missing or empty it defaults to ``<algo>_<hex checksum>``

    Any exception raised while parsing the parameters or fetching the
    content is delegated to ``handle_view_exception``.
    """
    try:
        params = request.GET
        wants_reencode = bool(strtobool(params.get('reencode', 'false')))
        algo, raw_checksum = query.parse_hash(query_string)
        hex_checksum = hash_to_hex(raw_checksum)
        content_data = request_content(
            query_string, max_size=None, reencode=wants_reencode)
    except Exception as exc:
        return handle_view_exception(request, exc)

    # Fall back on a name derived from the hash when none was requested.
    filename = params.get('filename', None) or '%s_%s' % (algo, hex_checksum)

    mimetype = content_data['mimetype']
    is_textual = mimetype.startswith('text/') or mimetype == 'inode/x-empty'
    if is_textual:
        content_type = "text/plain"
        disposition = 'filename=%s' % filename
    else:
        content_type = 'application/octet-stream'
        disposition = 'attachment; filename=%s' % filename

    response = HttpResponse(content_data['raw_data'],
                            content_type=content_type)
    response['Content-disposition'] = disposition
    return response
Esempio n. 3
0
def lookup_content_raw(q: str) -> Dict[str, Any]:
    """Fetch the raw bytes of the content designated by ``q``.

    Args:
        q: query string of the form <hash_algo:hash>

    Returns:
        The result of ``converters.from_content`` applied to a dict holding
        the content 'sha1' and its raw 'data'.

    Raises:
        NotFoundExc if the requested content is not found or
        if the content bytes are not available in the storage

    """
    content_info = lookup_content(q)
    sha1_bytes = hashutil.hash_to_bytes(content_info["checksums"]["sha1"])
    raw_bytes = storage.content_get_data(sha1_bytes)

    if raw_bytes is not None:
        return converters.from_content({"sha1": sha1_bytes, "data": raw_bytes})

    # The content is known to the archive but its bytes cannot be retrieved.
    algo, hash_ = query.parse_hash(q)
    raise NotFoundExc(
        f"Bytes of content with {algo} checksum equals "
        f"to {hashutil.hash_to_hex(hash_)} are not available!")
Esempio n. 4
0
def search_hash(q):
    """Tell whether the storage knows a content with the given checksum.

    Args:
        q: query string of the form <hash_algo:hash>

    Returns:
        Dict with a single 'found' key set to True when
        ``storage.content_find`` returned something other than None,
        False otherwise.

    """
    algo, checksum = query.parse_hash(q)
    match = storage.content_find({algo: checksum})
    is_present = match is not None
    return {'found': is_present}
Esempio n. 5
0
def lookup_hash(q):
    """Look a content up in the storage from one of its checksums.

    Args:
        q: query string of the form <hash_algo:hash>

    Returns:
        Dict with two keys: 'found', the storage answer passed through
        ``converters.from_content``, and 'algo', the hash algorithm
        parsed out of ``q``.

    """
    algo, checksum = query.parse_hash(q)
    match = storage.content_find({algo: checksum})
    # NOTE(review): a missing content is forwarded to from_content as-is;
    # presumably the converter maps a falsy input to None -- confirm.
    return dict(found=converters.from_content(match), algo=algo)
Esempio n. 6
0
def search_hash(q: str) -> Dict[str, bool]:
    """Tell whether the storage knows a content with the given checksum.

    Args:
        q: query string of the form <hash_algo:hash>

    Returns:
        Dict with a single "found" key set to True when the first element
        of the storage answer is not None, False otherwise.

    """
    algo, hash_ = query.parse_hash(q)
    candidates = storage.content_find({algo: hash_})
    first_match = _first_element(candidates)
    is_present = first_match is not None
    return {"found": is_present}
Esempio n. 7
0
def enrich_content(
    content: Dict[str, Any],
    top_url: Optional[bool] = False,
    query_string: Optional[str] = None,
    request: Optional[HttpRequest] = None,
) -> Dict[str, str]:
    """Add navigation links to a content dict, in place.

    The following keys are added when a suitable checksum is available:
        - content_url: the content itself (only when ``top_url`` is truthy)
        - data_url: its raw data
        - filetype_url: its filetype information
        - language_url: its programming language information
        - license_url: its licensing information

    Args:
        content: dict of data associated to a swh content object; checksums
            are read from its "checksums" entry when present, otherwise
            from the dict itself
        top_url: whether or not to include the content url in
            the enriched data
        query_string: optional query string of type '<algo>:<hash>'
            used when requesting the content; its algorithm is used to
            build the urls (sha1 is used when it is not provided)
        request: forwarded to ``reverse`` for every url but content_url

    Returns:
        The same ``content`` dict, left untouched when it holds no
        checksum for the selected algorithm.

    """
    checksums = content["checksums"] if "checksums" in content else content
    hash_algo = parse_hash(query_string)[0] if query_string else "sha1"

    # Without a checksum for the chosen algorithm no url can be computed.
    if hash_algo not in checksums:
        return content

    q = "%s:%s" % (hash_algo, checksums[hash_algo])

    if top_url:
        # NOTE(review): unlike the urls below, this one is reversed without
        # the request -- confirm whether that asymmetry is intended.
        content["content_url"] = reverse("api-1-content", url_args={"q": q})

    link_views = (
        ("data_url", "api-1-content-raw"),
        ("filetype_url", "api-1-content-filetype"),
        ("language_url", "api-1-content-language"),
        ("license_url", "api-1-content-license"),
    )
    for key, view_name in link_views:
        content[key] = reverse(view_name, url_args={"q": q}, request=request)

    return content
Esempio n. 8
0
def lookup_content(q):
    """Retrieve the content designated by ``q`` from the storage.

    Args:
        q: query string handed to ``query.parse_hash``, which yields the
            hash algorithm and the hash used for the storage lookup

    Returns:
        The storage answer passed through ``converters.from_content``.

    Raises:
        NotFoundExc if the requested content is not found

    """
    algo, checksum = query.parse_hash(q)
    match = storage.content_find({algo: checksum})
    if match:
        return converters.from_content(match)

    hex_checksum = hashutil.hash_to_hex(checksum)
    raise NotFoundExc('Content with %s checksum equals to %s not found!' %
                      (algo, hex_checksum))
Esempio n. 9
0
def lookup_content(q: str) -> Dict[str, Any]:
    """Retrieve the content designated by ``q`` from the storage.

    Args:
        q: query string handed to ``query.parse_hash``, which yields the
            hash algorithm and the hash used for the storage lookup

    Returns:
        The first storage match, turned into a dict and passed through
        ``converters.from_content``.

    Raises:
        NotFoundExc if the requested content is not found

    """
    algo, hash_ = query.parse_hash(q)
    candidates = storage.content_find({algo: hash_})
    match = _first_element(candidates)
    if match:
        return converters.from_content(match.to_dict())

    hhex = hashutil.hash_to_hex(hash_)
    raise NotFoundExc(
        f"Content with {algo} checksum equals to {hhex} not found!")
Esempio n. 10
0
def _lookup_content_sha1(q):
    """Resolve a content query to the sha1 of that content.

    Args:
        q: query string of the form <hash_algo:hash>

    Returns:
        The parsed hash itself when the query already uses sha1 (no
        storage lookup is made); otherwise the 'sha1' entry of the
        storage answer, or None when the storage finds nothing.

    """
    algo, checksum = query.parse_hash(q)
    if algo == 'sha1':
        return checksum

    match = storage.content_find({algo: checksum})
    return match['sha1'] if match else None
Esempio n. 11
0
def _lookup_content_sha1(q: str) -> Optional[bytes]:
    """Resolve a content query to the sha1 of that content.

    Args:
        q: query string of the form <hash_algo:hash>

    Returns:
        The parsed hash itself when the query already uses sha1 (no
        storage lookup is made); otherwise the ``sha1`` attribute of the
        first storage match, or None when there is no match.

    """
    algo, hash_ = query.parse_hash(q)
    if algo == "sha1":
        return hash_

    match = _first_element(storage.content_find({algo: hash_}))
    return match.sha1 if match else None
Esempio n. 12
0
def lookup_hash(q: str) -> Dict[str, Any]:
    """Look a content up in the storage from one of its checksums.

    Args:
        q: query string of the form <hash_algo:hash>

    Returns:
        Dict with two keys: "found", the converted content info when the
        storage has a match and None otherwise, and "algo", the hash
        algorithm parsed out of ``q``.

    """
    algo, hash_ = query.parse_hash(q)
    match = _first_element(storage.content_find({algo: hash_}))
    content = converters.from_content(match.to_dict()) if match else None
    return {"found": content, "algo": algo}
Esempio n. 13
0
def lookup_content_raw(q):
    """Fetch the raw data of the content designated by ``q``.

    Args:
        q: query string of the form <hash_algo:hash>

    Returns:
        The first element of the storage answer for the content sha1,
        passed through ``converters.from_content``.

    Raises:
        NotFoundExc if the requested content is not found or
        if the content bytes are not available in the storage

    """
    content_info = lookup_content(q)
    sha1_bytes = hashutil.hash_to_bytes(content_info['checksums']['sha1'])
    raw_content = _first_element(storage.content_get([sha1_bytes]))

    if raw_content:
        return converters.from_content(raw_content)

    # The content is known to the archive but its bytes cannot be retrieved.
    algo, checksum = query.parse_hash(q)
    hex_checksum = hashutil.hash_to_hex(checksum)
    raise NotFoundExc('Bytes of content with %s checksum equals to %s '
                      'are not available!' %
                      (algo, hex_checksum))
Esempio n. 14
0
 def test_parse_hash_check_sha256(self):
     """A 'sha256:'-prefixed, mixed-case hex digest parses to its bytes."""
     hex_digest = (
         '084C799CD551DD1D8D5C5F9A5D593B2E931F5E36122ee5c793c1d08a19839cc0')
     parsed = query.parse_hash('sha256:' + hex_digest)
     expected = ('sha256', hashutil.hash_to_bytes(hex_digest))
     self.assertEqual(parsed, expected)
Esempio n. 15
0
 def test_parse_hash_check_sha1_git(self):
     """A 'sha1_git:'-prefixed hex digest parses to its bytes."""
     hex_digest = 'e1d2d2f924e986ac86fdf7b36c94bcdf32beec15'
     parsed = query.parse_hash('sha1_git:' + hex_digest)
     expected = ('sha1_git', hashutil.hash_to_bytes(hex_digest))
     self.assertEqual(parsed, expected)
Esempio n. 16
0
 def test_parse_hash_guess_algo_malformed_hash(self):
     """A 16-character hash given without algorithm prefix is rejected."""
     self.assertRaises(BadInputExc, query.parse_hash, '1234567890987654')
Esempio n. 17
0
 def test_parse_hash_guess_sha1(self):
     """A 40-character hex digest without prefix is parsed as sha1."""
     hex_digest = 'f1d2d2f924e986ac86fdf7b36c94bcdf32beec15'
     parsed = query.parse_hash(hex_digest)
     expected = ('sha1', hashutil.hash_to_bytes(hex_digest))
     self.assertEqual(parsed, expected)
Esempio n. 18
0
 def test_parse_hash_malformed_query_with_more_than_2_parts(self):
     """A query holding more than one ':' separator is rejected."""
     self.assertRaises(BadInputExc, query.parse_hash,
                       'sha1:1234567890987654:other-stuff')
Esempio n. 19
0
def test_parse_hash_check_algo_unknown_one():
    """A query using the 'sha2' algorithm prefix is rejected."""
    unknown_algo_query = "sha2:1234567890987654"
    with pytest.raises(BadInputExc):
        query.parse_hash(unknown_algo_query)
Esempio n. 20
0
def test_parse_hash_check_algo_malformed_sha256_hash():
    """A 16-character hash announced as sha256 is rejected."""
    malformed_query = "sha256:1234567890987654"
    with pytest.raises(BadInputExc):
        query.parse_hash(malformed_query)
Esempio n. 21
0
def test_parse_hash_check_sha256():
    """A 'sha256:'-prefixed, mixed-case hex digest parses to its bytes."""
    hex_digest = (
        "084C799CD551DD1D8D5C5F9A5D593B2E931F5E36122ee5c793c1d08a19839cc0"
    )
    parsed = query.parse_hash("sha256:" + hex_digest)
    expected = ("sha256", hashutil.hash_to_bytes(hex_digest))
    assert parsed == expected
Esempio n. 22
0
def test_parse_hash_check_sha1_git():
    """A 'sha1_git:'-prefixed hex digest parses to its bytes."""
    hex_digest = "e1d2d2f924e986ac86fdf7b36c94bcdf32beec15"
    parsed = query.parse_hash("sha1_git:" + hex_digest)
    expected = ("sha1_git", hashutil.hash_to_bytes(hex_digest))
    assert parsed == expected
Esempio n. 23
0
def test_parse_hash_guess_algo_malformed_hash():
    """A 16-character hash given without algorithm prefix is rejected."""
    too_short_hash = "1234567890987654"
    with pytest.raises(BadInputExc):
        query.parse_hash(too_short_hash)
Esempio n. 24
0
def test_parse_hash_guess_sha1():
    """A 40-character hex digest without prefix is parsed as sha1."""
    hex_digest = "f1d2d2f924e986ac86fdf7b36c94bcdf32beec15"
    parsed = query.parse_hash(hex_digest)
    expected = ("sha1", hashutil.hash_to_bytes(hex_digest))
    assert parsed == expected
Esempio n. 25
0
def test_parse_hash_malformed_query_with_more_than_2_parts():
    """A query holding more than one ':' separator is rejected."""
    three_part_query = "sha1:1234567890987654:other-stuff"
    with pytest.raises(BadInputExc):
        query.parse_hash(three_part_query)
Esempio n. 26
0
def content_display(request, query_string):
    """Django view that produces an HTML display of a content identified
    by its hash value.

    The url that points to it is
    :http:get:`/browse/content/[(algo_hash):](hash)/`

    The following GET parameters are read: ``origin_url`` (falling back on
    ``origin``), ``snapshot``, ``path``, ``language``, ``branch``,
    ``release`` and ``revision``.

    Args:
        request: the incoming Django request
        query_string: content identifier, handed to ``query.parse_hash``
            and ``request_content``

    Returns:
        The rendered ``browse/content.html`` template. When
        ``request_content`` raises NotFoundExc the page is still rendered,
        with a 404 status and the error description in its context.

    Raises:
        NotFoundExc: when ``get_snapshot_context`` raises it; if its
            message starts with "Origin" it is replaced by a message
            pointing to the origin-less content url.
    """
    # algo and checksum are not used further down in this function.
    # NOTE(review): presumably kept so a malformed query string fails
    # early -- confirm.
    algo, checksum = query.parse_hash(query_string)
    checksum = hash_to_hex(checksum)
    origin_url = request.GET.get("origin_url")
    selected_language = request.GET.get("language")
    # "origin" is accepted as an alternative name for "origin_url".
    if not origin_url:
        origin_url = request.GET.get("origin")
    snapshot_id = request.GET.get("snapshot")
    path = request.GET.get("path")
    content_data = {}
    error_info = {"status_code": 200, "description": None}
    # A missing content does not abort the view: the error is recorded and
    # the page is rendered with an empty content_data.
    try:
        content_data = request_content(query_string)
    except NotFoundExc as e:
        error_info["status_code"] = 404
        error_info["description"] = f"NotFoundExc: {str(e)}"

    snapshot_context = None
    # A snapshot context is only computed when an origin or a snapshot was
    # given in the request.
    if origin_url is not None or snapshot_id is not None:
        try:
            snapshot_context = get_snapshot_context(
                origin_url=origin_url,
                snapshot_id=snapshot_id,
                branch_name=request.GET.get("branch"),
                release_name=request.GET.get("release"),
                revision_id=request.GET.get("revision"),
                path=path,
                browse_context=CONTENT,
            )
        except NotFoundExc as e:
            # Errors whose message starts with "Origin" get a friendlier
            # message linking to the content without origin information.
            if str(e).startswith("Origin"):
                raw_cnt_url = reverse("browse-content",
                                      url_args={"query_string": query_string})
                error_message = (
                    "The Software Heritage archive has a content "
                    "with the hash you provided but the origin "
                    "mentioned in your request appears broken: %s. "
                    "Please check the URL and try again.\n\n"
                    "Nevertheless, you can still browse the content "
                    "without origin information: %s" %
                    (gen_link(origin_url), gen_link(raw_cnt_url)))
                raise NotFoundExc(error_message)
            else:
                raise e
    content = None
    language = None
    mimetype = None
    # Display data can only be prepared when raw bytes were retrieved.
    if content_data.get("raw_data") is not None:
        content_display_data = prepare_content_for_display(
            content_data["raw_data"], content_data["mimetype"], path)
        content = content_display_data["content_data"]
        language = content_display_data["language"]
        mimetype = content_display_data["mimetype"]

    # Override language with user-selected language
    if selected_language is not None:
        language = selected_language

    available_languages = None

    # Substring test: any mimetype containing "text/" enables the list of
    # highlightjs languages.
    if mimetype and "text/" in mimetype:
        available_languages = highlightjs.get_supported_languages()

    filename = None
    path_info = None
    directory_id = None

    root_dir = None
    if snapshot_context:
        root_dir = snapshot_context.get("root_directory")

    # Not a copy: when a snapshot context exists, the pop/assignments done
    # below on query_params also modify snapshot_context["query_params"].
    query_params = snapshot_context["query_params"] if snapshot_context else {}

    breadcrumbs = []

    if path:
        split_path = path.split("/")
        # Without a root directory from the snapshot context, the first
        # path component is used as root directory identifier.
        root_dir = root_dir or split_path[0]
        filename = split_path[-1]
        if root_dir != path:
            # NOTE(review): str.replace removes every occurrence of
            # "<root_dir>/" in the path, not only a leading one.
            path = path.replace(root_dir + "/", "")
            # NOTE(review): when filename is empty (path ending with "/")
            # this slice is path[:0], i.e. an empty string.
            path = path[:-len(filename)]
            path_info = gen_path_info(path)
            query_params.pop("path", None)
            dir_url = reverse(
                "browse-directory",
                url_args={"sha1_git": root_dir},
                query_params=query_params,
            )
            breadcrumbs.append({"name": root_dir[:7], "url": dir_url})
            # One breadcrumb per intermediate directory, each linking to the
            # root directory view with the matching "path" parameter.
            for pi in path_info:
                query_params["path"] = pi["path"]
                dir_url = reverse(
                    "browse-directory",
                    url_args={"sha1_git": root_dir},
                    query_params=query_params,
                )
                breadcrumbs.append({"name": pi["name"], "url": dir_url})
        # The file itself closes the breadcrumb trail, without a link.
        breadcrumbs.append({"name": filename, "url": None})

    # At this point path holds the directory part computed above (or the
    # original value when it equals root_dir).
    if path and root_dir != path:
        dir_info = archive.lookup_directory_with_path(root_dir, path)
        directory_id = dir_info["target"]
    elif root_dir != path:
        directory_id = root_dir
    else:
        root_dir = None

    # query_params is rebound to a fresh dict for the raw content url.
    query_params = {"filename": filename}

    content_checksums = content_data.get("checksums", {})

    content_url = reverse(
        "browse-content",
        url_args={"query_string": query_string},
    )

    content_raw_url = reverse(
        "browse-content-raw",
        url_args={"query_string": query_string},
        query_params=query_params,
    )

    # Every content_data field is read with .get() since content_data is an
    # empty dict when the content was not found.
    content_metadata = ContentMetadata(
        object_type=CONTENT,
        object_id=content_checksums.get("sha1_git"),
        sha1=content_checksums.get("sha1"),
        sha1_git=content_checksums.get("sha1_git"),
        sha256=content_checksums.get("sha256"),
        blake2s256=content_checksums.get("blake2s256"),
        content_url=content_url,
        mimetype=content_data.get("mimetype"),
        encoding=content_data.get("encoding"),
        size=filesizeformat(content_data.get("length", 0)),
        language=content_data.get("language"),
        root_directory=root_dir,
        path=f"/{path}" if path else None,
        filename=filename or "",
        directory=directory_id,
        revision=None,
        release=None,
        snapshot=None,
        origin_url=origin_url,
    )

    # Objects for which identifiers are computed: the content first, then
    # the directory, revision, snapshot and release when they are known.
    swh_objects = [
        SWHObjectInfo(object_type=CONTENT,
                      object_id=content_checksums.get("sha1_git"))
    ]

    if directory_id:
        swh_objects.append(
            SWHObjectInfo(object_type=DIRECTORY, object_id=directory_id))

    if snapshot_context:
        swh_objects.append(
            SWHObjectInfo(object_type=REVISION,
                          object_id=snapshot_context["revision_id"]))
        swh_objects.append(
            SWHObjectInfo(object_type=SNAPSHOT,
                          object_id=snapshot_context["snapshot_id"]))
        if snapshot_context["release_id"]:
            swh_objects.append(
                SWHObjectInfo(object_type=RELEASE,
                              object_id=snapshot_context["release_id"]))

    swhids_info = get_swhids_info(
        swh_objects,
        snapshot_context,
        extra_context=content_metadata,
    )

    heading = "Content - %s" % content_checksums.get("sha1_git")
    if breadcrumbs:
        content_path = "/".join([bc["name"] for bc in breadcrumbs])
        heading += " - %s" % content_path

    # The HTTP status of the response mirrors the recorded error status
    # (200 unless the content lookup failed).
    return render(
        request,
        "browse/content.html",
        {
            "heading":
            heading,
            "swh_object_id":
            swhids_info[0]["swhid"],
            "swh_object_name":
            "Content",
            "swh_object_metadata":
            content_metadata,
            "content":
            content,
            "content_size":
            content_data.get("length"),
            "max_content_size":
            content_display_max_size,
            "filename":
            filename,
            "encoding":
            content_data.get("encoding"),
            "mimetype":
            mimetype,
            "language":
            language,
            "available_languages":
            available_languages,
            "breadcrumbs":
            breadcrumbs,
            "top_right_link": {
                "url": content_raw_url,
                "icon": swh_object_icons["content"],
                "text": "Raw File",
            },
            "snapshot_context":
            snapshot_context,
            "vault_cooking":
            None,
            "show_actions":
            True,
            "swhids_info":
            swhids_info,
            "error_code":
            error_info["status_code"],
            "error_message":
            http_status_code_message.get(error_info["status_code"]),
            "error_description":
            error_info["description"],
        },
        status=error_info["status_code"],
    )
Esempio n. 27
0
 def test_parse_hash_check_algo_malformed_sha256_hash(self):
     """A 16-character hash announced as sha256 is rejected."""
     self.assertRaises(BadInputExc, query.parse_hash,
                       'sha256:1234567890987654')
Esempio n. 28
0
 def test_parse_hash_check_algo_unknown_one(self):
     """A query using the 'sha2' algorithm prefix is rejected."""
     self.assertRaises(BadInputExc, query.parse_hash,
                       'sha2:1234567890987654')
Esempio n. 29
0
def content_display(request, query_string):
    """Django view that produces an HTML display of a content identified
    by its hash value.

    The url that points to it is :http:get:`/browse/content/[(algo_hash):](hash)/`

    The following GET parameters are read: ``origin_type``, ``origin_url``
    (falling back on ``origin``) and ``path``.

    Args:
        request: the incoming Django request
        query_string: content identifier, handed to ``query.parse_hash``
            and ``request_content``

    Returns:
        The rendered ``browse/content.html`` template, whose HTTP status is
        the ``error_code`` entry of the content data; or the result of
        ``handle_view_exception`` when parsing the query, fetching the
        content or computing the snapshot context fails.
    """ # noqa
    try:
        # algo and checksum are not used further down in this function.
        # NOTE(review): presumably kept so a malformed query string fails
        # early -- confirm.
        algo, checksum = query.parse_hash(query_string)
        checksum = hash_to_hex(checksum)
        content_data = request_content(query_string,
                                       raise_if_unavailable=False)
        origin_type = request.GET.get('origin_type', None)
        origin_url = request.GET.get('origin_url', None)
        # 'origin' is accepted as an alternative name for 'origin_url'.
        if not origin_url:
            origin_url = request.GET.get('origin', None)
        snapshot_context = None
        if origin_url:
            # NOTE(review): every exception raised here, whatever its type,
            # is reported as a broken origin.
            try:
                snapshot_context = get_snapshot_context(
                    None, origin_type, origin_url)
            except Exception:
                raw_cnt_url = reverse('browse-content',
                                      url_args={'query_string': query_string})
                error_message = \
                    ('The Software Heritage archive has a content '
                     'with the hash you provided but the origin '
                     'mentioned in your request appears broken: %s. '
                     'Please check the URL and try again.\n\n'
                     'Nevertheless, you can still browse the content '
                     'without origin information: %s'
                        % (gen_link(origin_url), gen_link(raw_cnt_url)))

                raise NotFoundExc(error_message)
        if snapshot_context:
            snapshot_context['visit_info'] = None
    except Exception as exc:
        return handle_view_exception(request, exc)

    path = request.GET.get('path', None)

    content = None
    language = None
    mimetype = None
    # raw_data can be None here.
    # NOTE(review): presumably because the content was requested with
    # raise_if_unavailable=False -- confirm against request_content.
    if content_data['raw_data'] is not None:
        content_display_data = prepare_content_for_display(
            content_data['raw_data'], content_data['mimetype'], path)
        content = content_display_data['content_data']
        language = content_display_data['language']
        mimetype = content_display_data['mimetype']

    root_dir = None
    filename = None
    path_info = None

    query_params = {'origin': origin_url}

    breadcrumbs = []

    if path:
        split_path = path.split('/')
        # The first path component is used as root directory identifier.
        root_dir = split_path[0]
        filename = split_path[-1]
        if root_dir != path:
            # NOTE(review): str.replace removes every occurrence of
            # '<root_dir>/' in the path, not only a leading one.
            path = path.replace(root_dir + '/', '')
            # NOTE(review): when filename is empty (path ending with '/')
            # this slice is path[:0], i.e. an empty string.
            path = path[:-len(filename)]
            path_info = gen_path_info(path)
            dir_url = reverse('browse-directory',
                              url_args={'sha1_git': root_dir},
                              query_params=query_params)
            breadcrumbs.append({'name': root_dir[:7], 'url': dir_url})
            # One breadcrumb per intermediate directory.
            for pi in path_info:
                dir_url = reverse('browse-directory',
                                  url_args={
                                      'sha1_git': root_dir,
                                      'path': pi['path']
                                  },
                                  query_params=query_params)
                breadcrumbs.append({'name': pi['name'], 'url': dir_url})
        # The file itself closes the breadcrumb trail, without a link.
        breadcrumbs.append({'name': filename, 'url': None})

    # query_params is rebound to a fresh dict for the raw content url.
    query_params = {'filename': filename}

    content_raw_url = reverse('browse-content-raw',
                              url_args={'query_string': query_string},
                              query_params=query_params)

    content_metadata = {
        'sha1 checksum': content_data['checksums']['sha1'],
        'sha1_git checksum': content_data['checksums']['sha1_git'],
        'sha256 checksum': content_data['checksums']['sha256'],
        'blake2s256 checksum': content_data['checksums']['blake2s256'],
        'mime type': content_data['mimetype'],
        'encoding': content_data['encoding'],
        'size': filesizeformat(content_data['length']),
        'language': content_data['language'],
        'licenses': content_data['licenses'],
        'filename': filename
    }

    # NOTE(review): redundant, 'filename' is already set in the dict
    # literal above.
    if filename:
        content_metadata['filename'] = filename

    sha1_git = content_data['checksums']['sha1_git']
    swh_ids = get_swh_persistent_ids([{'type': 'content', 'id': sha1_git}])

    heading = 'Content - %s' % sha1_git
    if breadcrumbs:
        content_path = '/'.join([bc['name'] for bc in breadcrumbs])
        heading += ' - %s' % content_path

    # The error fields and the HTTP status come straight from content_data.
    return render(request,
                  'browse/content.html', {
                      'heading': heading,
                      'swh_object_id': swh_ids[0]['swh_id'],
                      'swh_object_name': 'Content',
                      'swh_object_metadata': content_metadata,
                      'content': content,
                      'content_size': content_data['length'],
                      'max_content_size': content_display_max_size,
                      'mimetype': mimetype,
                      'language': language,
                      'breadcrumbs': breadcrumbs,
                      'top_right_link': {
                          'url': content_raw_url,
                          'icon': swh_object_icons['content'],
                          'text': 'Raw File'
                      },
                      'snapshot_context': snapshot_context,
                      'vault_cooking': None,
                      'show_actions_menu': True,
                      'swh_ids': swh_ids,
                      'error_code': content_data['error_code'],
                      'error_message': content_data['error_message'],
                      'error_description': content_data['error_description']
                  },
                  status=content_data['error_code'])