Example #1
0
    def _get_page_content_from_storage(self, project, version_slug, filename):
        """
        Read the raw content of a built HTML file from the build media storage.

        The version is looked up in ``project.versions`` by slug through
        ``get_object_or_404`` and is restricted to the ``PUBLIC`` privacy level.

        :param project: object exposing ``versions`` and ``get_storage_path``
        :param version_slug: slug of the version the file belongs to
        :param filename: path of the file, joined onto the version's HTML
            storage path
        :returns: the result of ``read()`` on the stored file, or ``None`` when
            opening or reading the file raised an exception
        """
        # Only allow PUBLIC versions when getting the content from our
        # storage for privacy/security reasons
        public_version = get_object_or_404(
            project.versions, slug=version_slug, privacy_level=PUBLIC,
        )
        html_root = project.get_storage_path(
            'html',
            version_slug=public_version.slug,
            include_file=False,
            version_type=public_version.type,
        )
        file_path = build_media_storage.join(html_root, filename)

        try:
            with build_media_storage.open(file_path) as stored_file:  # pylint: disable=invalid-name
                return stored_file.read()
        except Exception:  # noqa
            # Best effort: any failure is logged and reported as "no content".
            log.warning('Unable to read file. file_path=%s', file_path)
            return None
Example #2
0
def _get_doc_content(project, version, doc):
    """
    Load the ``.fjson`` file of a document from the build media storage.

    :param project: object exposing ``get_storage_path``
    :param version: object exposing ``slug`` and ``type``
    :param doc: document name without the ``.fjson`` extension; leading
        slashes are stripped before joining it onto the storage path
    :returns: the parsed JSON content, or ``None`` when opening or parsing the
        file raised an exception
    """
    json_root = project.get_storage_path(
        'json',
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    relative_name = f'{doc}.fjson'.lstrip('/')
    file_path = build_media_storage.join(json_root, relative_name)

    try:
        with build_media_storage.open(file_path) as stored_file:
            return json.load(stored_file)
    except Exception:  # noqa
        # Best effort: any failure is logged and reported as "no content".
        log.warning('Unable to read file. file_path=%s', file_path)
        return None
Example #3
0
    def get(self, request, proxito_path, template_name='404.html'):
        """
        Handler for 404 pages on subdomains.

        This does a couple things:

        * Handles directory indexing for URLs that don't end in a slash
        * Handles directory indexing for README.html (for now)
        * Handles custom 404 serving

        For 404's, first search for a 404 page in the current version, then continues
        with the default version and finally, if none of them are found, the Read
        the Docs default page (Maze Found) is rendered by Django and served.

        :param request: the incoming request; its full path is used to recover
            the query string for index redirects
        :param proxito_path: the path that was not found (without query params)
        :param template_name: not referenced in this method's body
        :returns: a redirect to the directory index / ``README.html`` when one
            exists in storage, the response built by ``get_redirect_response``
            when a redirect matches, or a response with status code 404
            carrying the custom 404 page found in storage
        :raises Http404: when none of the above applies
        """
        # pylint: disable=too-many-locals
        log.info('Executing 404 handler. proxito_path=%s', proxito_path)

        # Parse the URL using the normal urlconf, so we get proper subdomain/translation data
        _, __, kwargs = url_resolve(
            proxito_path,
            urlconf='readthedocs.proxito.urls',
        )

        version_slug = kwargs.get('version_slug')
        version_slug = self.get_version_from_host(request, version_slug)
        final_project, lang_slug, version_slug, filename = _get_project_data_from_request(  # noqa
            request,
            project_slug=kwargs.get('project_slug'),
            subproject_slug=kwargs.get('subproject_slug'),
            lang_slug=kwargs.get('lang_slug'),
            version_slug=version_slug,
            filename=kwargs.get('filename', ''),
        )

        storage_root_path = final_project.get_storage_path(
            type_='html',
            version_slug=version_slug,
            include_file=False,
            version_type=self.version_type,
        )

        # First, check for dirhtml with slash
        for tryfile in ('index.html', 'README.html'):
            storage_filename_path = build_media_storage.join(
                storage_root_path,
                f'{filename}/{tryfile}'.lstrip('/'),
            )
            log.debug(
                'Trying index filename: project=%s version=%s, file=%s',
                final_project.slug,
                version_slug,
                storage_filename_path,
            )
            if build_media_storage.exists(storage_filename_path):
                log.info(
                    'Redirecting to index file: project=%s version=%s, storage_path=%s',
                    final_project.slug,
                    version_slug,
                    storage_filename_path,
                )
                # Use urlparse so that we maintain GET args in our redirect
                parts = urlparse(proxito_path)
                if tryfile == 'README.html':
                    new_path = parts.path.rstrip('/') + f'/{tryfile}'
                else:
                    new_path = parts.path.rstrip('/') + '/'

                # ``proxito_path`` doesn't include query params, so take them
                # from the request's full path instead.
                query = urlparse(request.get_full_path()).query
                new_parts = parts._replace(
                    path=new_path,
                    query=query,
                )
                redirect_url = new_parts.geturl()

                # TODO: decide if we need to check for infinite redirect here
                # (from URL == to URL)
                return HttpResponseRedirect(redirect_url)

        # ``redirect_filename`` is the path without ``/<lang>/<version>`` and
        # without query, starting with a ``/``. This matches our old logic:
        # https://github.com/readthedocs/readthedocs.org/blob/4b09c7a0ab45cd894c3373f7f07bad7161e4b223/readthedocs/redirects/utils.py#L60
        # We parse ``filename`` to remove the query from it
        redirect_filename = urlparse(filename).path

        # we can't check for lang and version here to decide if we need to add
        # the ``/`` or not because ``/install.html`` is a valid path to use as
        # redirect and does not include lang and version on it. It should be
        # fine always adding the ``/`` to the beginning.
        redirect_filename = '/' + redirect_filename.lstrip('/')

        # Check and perform redirects on 404 handler
        # NOTE: this redirect check must be done after trying files like
        # ``index.html`` and ``README.html`` to emulate the behavior we had when
        # serving directly from NGINX without passing through Python.
        redirect_path, http_status = self.get_redirect(
            project=final_project,
            lang_slug=lang_slug,
            version_slug=version_slug,
            filename=redirect_filename,
            full_path=proxito_path,
        )
        if redirect_path and http_status:
            try:
                return self.get_redirect_response(request, redirect_path,
                                                  proxito_path, http_status)
            except InfiniteRedirectException:
                # Continue with our normal 404 handling in this case
                pass

        # If that doesn't work, attempt to serve the 404 of the current version (version_slug)
        # Secondly, try to serve the 404 page for the default version
        # (project.get_default_version())
        doc_type = (Version.objects.filter(project=final_project,
                                           slug=version_slug).values_list(
                                               'documentation_type',
                                               flat=True).first())
        versions = [(version_slug, doc_type)]
        default_version_slug = final_project.get_default_version()
        if default_version_slug != version_slug:
            default_version_doc_type = (Version.objects.filter(
                project=final_project,
                slug=default_version_slug).values_list('documentation_type',
                                                       flat=True).first())
            versions.append((default_version_slug, default_version_doc_type))

        for version_slug_404, doc_type_404 in versions:
            # Skip versions this request is not allowed to see.
            if not self.allowed_user(request, final_project, version_slug_404):
                continue

            storage_root_path = final_project.get_storage_path(
                type_='html',
                version_slug=version_slug_404,
                include_file=False,
                version_type=self.version_type,
            )
            tryfiles = ['404.html']
            # SPHINX_HTMLDIR is the only builder
            # that could output a 404/index.html file.
            if doc_type_404 == SPHINX_HTMLDIR:
                tryfiles.append('404/index.html')
            for tryfile in tryfiles:
                storage_filename_path = build_media_storage.join(
                    storage_root_path, tryfile)
                if build_media_storage.exists(storage_filename_path):
                    log.info(
                        'Serving custom 404.html page: [project: %s] [version: %s]',
                        final_project.slug,
                        version_slug_404,
                    )
                    # Read through a context manager so the storage file is
                    # closed once its content has been loaded.
                    with build_media_storage.open(storage_filename_path) as fd:  # pylint: disable=invalid-name
                        content = fd.read()
                    resp = HttpResponse(content)
                    resp.status_code = 404
                    return resp

        raise Http404('No custom 404 page found.')
Example #4
0
def _create_intersphinx_data(version, commit, build):
    """
    Create intersphinx data for this version.

    Reads ``objects.inv`` from the version's HTML storage path and creates one
    ``SphinxDomain`` object per inventory entry whose document has a matching
    ``HTMLFile`` for this build. Nothing is done when the version is not a
    Sphinx type or when ``objects.inv`` does not exist in storage.

    :param version: Version instance
    :param commit: Commit that updated path
    :param build: Build id
    :returns: ``None``
    """
    if not version.is_sphinx_type:
        return

    html_storage_path = version.project.get_storage_path(
        type_='html', version_slug=version.slug, include_file=False
    )
    json_storage_path = version.project.get_storage_path(
        type_='json', version_slug=version.slug, include_file=False
    )

    object_file = build_media_storage.join(html_storage_path, 'objects.inv')
    if not build_media_storage.exists(object_file):
        log.debug('No objects.inv, skipping intersphinx indexing.')
        return

    # Optional file with display names; when it is missing or unreadable the
    # display values below fall back to empty strings.
    type_file = build_media_storage.join(json_storage_path, 'readthedocs-sphinx-domain-names.json')
    types = {}
    titles = {}
    if build_media_storage.exists(type_file):
        try:
            # Use a context manager so the storage file is always closed.
            with build_media_storage.open(type_file) as fd:  # pylint: disable=invalid-name
                data = json.load(fd)
            types = data['types']
            titles = data['titles']
        except Exception:
            log.exception('Exception parsing readthedocs-sphinx-domain-names.json')

    # These classes are copied from Sphinx
    # https://github.com/sphinx-doc/sphinx/blob/d79d041f4f90818e0b495523fdcc28db12783caf/sphinx/ext/intersphinx.py#L400-L403  # noqa
    class MockConfig:
        intersphinx_timeout = None
        tls_verify = False
        user_agent = None

    class MockApp:
        srcdir = ''
        config = MockConfig()

        def warn(self, msg):
            log.warning('Sphinx MockApp.', msg=msg)

    # Re-create all objects from the new build of the version
    object_file_url = build_media_storage.url(object_file)
    if object_file_url.startswith('/'):
        # Filesystem backed storage simply prepends MEDIA_URL to the path to get the URL
        # This can cause an issue if MEDIA_URL is not fully qualified
        object_file_url = settings.RTD_INTERSPHINX_URL + object_file_url

    invdata = intersphinx.fetch_inventory(MockApp(), '', object_file_url)
    # Guard the inventory itself (not its items) so a ``None``/empty result
    # is treated as "no entries" instead of raising ``AttributeError``.
    for key, value in sorted((invdata or {}).items()):
        domain, _type = key.split(':', 1)
        for name, einfo in sorted(value.items()):
            # project, version, url, display_name
            # ('Sphinx', '1.7.9', 'faq.html#epub-faq', 'Epub info')
            try:
                url = einfo[2]
                if '#' in url:
                    doc_name, anchor = url.split(
                        '#',
                        # The anchor can contain ``#`` characters
                        maxsplit=1
                    )
                else:
                    doc_name, anchor = url, ''
                display_name = einfo[3]
            except Exception:
                log.exception(
                    'Error while getting sphinx domain information. Skipping...',
                    project_slug=version.project.slug,
                    version_slug=version.slug,
                    sphinx_domain=f'{domain}->{name}',
                )
                continue

            # HACK: This is done because the difference between
            # ``sphinx.builders.html.StandaloneHTMLBuilder``
            # and ``sphinx.builders.dirhtml.DirectoryHTMLBuilder``.
            # They both have different ways of generating HTML Files,
            # and therefore the doc_name generated is different.
            # More info on: http://www.sphinx-doc.org/en/master/usage/builders/index.html#builders
            # Also see issue: https://github.com/readthedocs/readthedocs.org/issues/5821
            if doc_name.endswith('/'):
                doc_name += 'index.html'

            html_file = HTMLFile.objects.filter(
                project=version.project, version=version,
                path=doc_name, build=build,
            ).first()

            if not html_file:
                log.debug(
                    'HTMLFile object not found.',
                    project_slug=version.project.slug,
                    version_slug=version.slug,
                    build_id=build,
                    doc_name=doc_name
                )

                # Don't create Sphinx Domain objects
                # if the HTMLFile object is not found.
                continue

            SphinxDomain.objects.create(
                project=version.project,
                version=version,
                html_file=html_file,
                domain=domain,
                name=name,
                display_name=display_name,
                type=_type,
                type_display=types.get(f'{domain}:{_type}', ''),
                doc_name=doc_name,
                doc_display=titles.get(doc_name, ''),
                anchor=anchor,
                commit=commit,
                build=build,
            )