Example #1
0
    def _get_page_content_from_storage(self, project, version_slug, filename):
        """
        Read ``filename`` from the HTML build output of a public version.

        :param project: project owning the version
        :param version_slug: slug of the version to read from
        :param filename: name of the file, joined onto the version's HTML
            storage path
        :returns: whatever ``read()`` returns for the stored file, or ``None``
            when the file could not be opened or read
        """
        # Restrict the lookup to PUBLIC versions: content read straight from
        # storage must never leak private documentation.
        version = get_object_or_404(
            project.versions,
            slug=version_slug,
            privacy_level=PUBLIC,
        )
        html_root = project.get_storage_path(
            'html',
            version_slug=version.slug,
            include_file=False,
            version_type=version.type,
        )
        file_path = build_media_storage.join(html_root, filename)

        content = None
        try:
            with build_media_storage.open(file_path) as fd:  # pylint: disable=invalid-name
                content = fd.read()
        except Exception:  # noqa
            # Best effort: any storage failure is reported and turned into ``None``.
            log.warning('Unable to read file. file_path=%s', file_path)
            content = None

        return content
Example #2
0
def _create_imported_files(*, version, commit, build, search_ranking, search_ignore):
    """
    Create one ``HTMLFile`` per HTML page found in the version's build output.

    :param version: Version instance
    :param commit: Commit that updated path
    :param build: Build id
    :param search_ranking: mapping of ``fnmatch`` pattern to rank; when several
        patterns match a page, the one declared last wins. Pages matching no
        pattern get rank ``0``.
    :param search_ignore: iterable of ``fnmatch`` patterns; a page matching any
        of them is stored with ``ignore=True``.
    """
    project = version.project
    html_root = project.get_storage_path(
        type_='html',
        version_slug=version.slug,
        include_file=False,
    )

    for dirpath, __, names in build_media_storage.walk(html_root):
        # We only index HTML pages.
        for name in (candidate for candidate in names if candidate.endswith('.html')):
            absolute = build_media_storage.join(dirpath, name)

            # Path relative to the storage root, similar to os.path.relpath
            relpath = absolute.replace(html_root, '', 1).lstrip('/')

            # Walk the patterns from last to first so the last declared match
            # takes precedence.
            # XXX: see if we can implement another type of precedence,
            # like the longest pattern.
            page_rank = next(
                (
                    rank
                    for pattern, rank in reversed(list(search_ranking.items()))
                    if fnmatch(relpath, pattern)
                ),
                0,
            )
            ignore = any(fnmatch(relpath, pattern) for pattern in search_ignore)

            HTMLFile.objects.create(
                project=project,
                version=version,
                path=relpath,
                name=name,
                rank=page_rank,
                commit=commit,
                build=build,
                ignore=ignore,
            )

    # This signal is used for purging the CDN.
    files_changed.send(sender=Project, project=project, version=version)
Example #3
0
    def get(self, request, project):
        """
        Serve custom user's defined ``/robots.txt``.

        If the user added a ``robots.txt`` in the "default version" of the
        project, we serve it directly. Otherwise a generated ``robots.txt``
        is rendered from the ``robots.txt`` template.

        :param request: incoming request
        :param project: project the ``robots.txt`` is requested for
        :raises Http404: if the default version does not exist, is private,
            is not active or is not built
        """

        # Use the ``robots.txt`` file from the default version configured
        version_slug = project.get_default_version()
        # ``filter().first()`` instead of ``get()``: a slug that matches no
        # version must end up as a 404, not as an unhandled ``DoesNotExist``.
        version = project.versions.filter(slug=version_slug).first()

        no_serve_robots_txt = (
            # If the default version does not exist or,
            version is None
            # default version is private or,
            or version.privacy_level == constants.PRIVATE
            # default version is not active or,
            or not version.active
            # default version is not built
            or not version.built
        )

        if no_serve_robots_txt:
            # ... we do return a 404
            raise Http404()

        storage_path = project.get_storage_path(
            type_='html',
            version_slug=version_slug,
            include_file=False,
            version_type=self.version_type,
        )
        path = build_media_storage.join(storage_path, 'robots.txt')

        if build_media_storage.exists(path):
            # Strip scheme and domain so only the path (and query) is passed on
            url = build_media_storage.url(path)
            url = urlparse(url)._replace(scheme='', netloc='').geturl()
            return self._serve_docs(
                request,
                final_project=project,
                path=url,
            )

        # No custom file: render the default ``robots.txt`` template
        sitemap_url = '{scheme}://{domain}/sitemap.xml'.format(
            scheme='https',
            domain=project.subdomain(),
        )
        context = {
            'sitemap_url': sitemap_url,
            'hidden_paths': self._get_hidden_paths(project),
        }
        return render(
            request,
            'robots.txt',
            context,
            content_type='text/plain',
        )
Example #4
0
def _get_doc_content(project, version, doc):
    """
    Load the ``.fjson`` file of ``doc`` from the version's JSON build output.

    :param project: project owning the version
    :param version: version whose ``slug`` and ``type`` select the storage path
    :param doc: document name; ``.fjson`` is appended and leading slashes
        are stripped before joining it onto the storage path
    :returns: the parsed JSON content, or ``None`` if the file could not be
        opened or parsed
    """
    json_root = project.get_storage_path(
        'json',
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    relative_name = f'{doc}.fjson'.lstrip('/')
    file_path = build_media_storage.join(json_root, relative_name)

    content = None
    try:
        with build_media_storage.open(file_path) as file:
            content = json.load(file)
    except Exception:  # noqa
        # Best effort: report the failure and fall back to ``None``.
        log.warning('Unable to read file. file_path=%s', file_path)
        content = None

    return content
Example #5
0
    def get(
        self,
        request,
        project_slug=None,
        subproject_slug=None,
        subproject_slash=None,
        lang_slug=None,
        version_slug=None,
        filename='',
    ):  # noqa
        """
        Resolve the parsed URL parts into a file and serve it.

        The checks run in this order: canonical redirect, ``/`` redirect for
        projects with versions, ``/projects/<subproject>`` slash redirect,
        invalid URL (404), user permissions, and finally serving the file
        from storage.

        ``subproject_slash`` is used to determine if the subproject URL has a slash,
        so that we can decide if we need to serve docs or add a /.
        """

        version_slug = self.get_version_from_host(request, version_slug)
        (
            final_project,
            lang_slug,
            version_slug,
            filename,
        ) = _get_project_data_from_request(  # noqa
            request,
            project_slug=project_slug,
            subproject_slug=subproject_slug,
            lang_slug=lang_slug,
            version_slug=version_slug,
            filename=filename,
        )

        log.debug(
            'Serving docs: project=%s, subproject=%s, lang_slug=%s, version_slug=%s, filename=%s',
            final_project.slug,
            subproject_slug,
            lang_slug,
            version_slug,
            filename,
        )

        # Handle requests that need canonicalizing (eg. HTTP -> HTTPS, redirect to canonical domain)
        if hasattr(request, 'canonicalize'):
            try:
                return self.canonical_redirect(
                    request,
                    final_project,
                    version_slug,
                    filename,
                )
            except InfiniteRedirectException:
                # Don't redirect in this case, since it would break things
                pass

        # Handle a / redirect when we aren't a single version
        no_lang = lang_slug is None
        # External versions/builds will always have a version,
        # because it is taken from the host name
        no_version_or_external = (
            version_slug is None or hasattr(request, 'external_domain')
        )
        no_filename = filename == ''
        multi_version = not final_project.single_version
        if no_lang and no_version_or_external and no_filename and multi_version:
            return self.system_redirect(
                request,
                final_project,
                lang_slug,
                version_slug,
                filename,
            )

        # Handle `/projects/subproject` URL redirection:
        # when there _is_ a subproject_slug but not a subproject_slash
        single_version = final_project.single_version
        no_filename = filename == ''
        if single_version and no_filename and subproject_slug and not subproject_slash:
            return self.system_redirect(
                request,
                final_project,
                lang_slug,
                version_slug,
                filename,
            )

        # A project with versions needs both language and version in the URL,
        # unless an external version is being served.
        incomplete_url = lang_slug is None or version_slug is None
        multi_version = not final_project.single_version
        not_external = self.version_type != EXTERNAL
        if incomplete_url and multi_version and not_external:
            log.warning(
                'Invalid URL for project with versions. url=%s, project=%s',
                filename,
                final_project.slug,
            )
            raise Http404('Invalid URL for project with versions')

        # TODO: un-comment when ready to perform redirect here
        # redirect_path, http_status = self.get_redirect(
        #     final_project,
        #     lang_slug,
        #     version_slug,
        #     filename,
        #     request.path,
        # )
        # if redirect_path and http_status:
        #     return self.get_redirect_response(request, redirect_path, http_status)

        # Check user permissions and return an unauthed response if needed
        if not self.allowed_user(request, final_project, version_slug):
            return self.get_unauthed_response(request, final_project)

        html_root = final_project.get_storage_path(
            type_='html',
            version_slug=version_slug,
            include_file=False,
            version_type=self.version_type,
        )

        # If ``filename`` is empty, serve from ``/``
        path = build_media_storage.join(html_root, filename.lstrip('/'))
        # Our backend storage does not support directory indexes, so the
        # ``index.html`` is appended here. It must be added before calling
        # ``storage.url`` since the Signature and Expire time is calculated
        # per file.
        if path[-1] == '/':
            path += 'index.html'

        # NOTE: calling ``.url`` will remove the trailing slash
        signed_url = build_media_storage.url(path, http_method=request.method)

        # URL without scheme and domain to perform an NGINX internal redirect
        final_url = urlparse(signed_url)._replace(scheme='', netloc='').geturl()

        return self._serve_docs(
            request,
            final_project=final_project,
            version_slug=version_slug,
            path=final_url,
        )
Example #6
0
    def get(self, request, proxito_path, template_name='404.html'):
        """
        Handler for 404 pages on subdomains.

        This does a couple things:

        * Handles directory indexing for URLs that don't end in a slash
        * Handles directory indexing for README.html (for now)
        * Handles custom 404 serving

        For 404's, first search for a 404 page in the current version, then continues
        with the default version and finally, if none of them are found, the Read
        the Docs default page (Maze Found) is rendered by Django and served.

        :param request: incoming request
        :param proxito_path: path that triggered the 404 (without query params)
        :param template_name: not read by this method's body
        :returns: a redirect response (index file or user-defined redirect) or
            a response with status 404 carrying the custom 404 page
        :raises Http404: if no redirect applies and no custom 404 page exists
        """
        # pylint: disable=too-many-locals
        log.info('Executing 404 handler. proxito_path=%s', proxito_path)

        # Parse the URL using the normal urlconf, so we get proper subdomain/translation data
        _, __, kwargs = url_resolve(
            proxito_path,
            urlconf='readthedocs.proxito.urls',
        )

        version_slug = kwargs.get('version_slug')
        version_slug = self.get_version_from_host(request, version_slug)
        final_project, lang_slug, version_slug, filename = _get_project_data_from_request(  # noqa
            request,
            project_slug=kwargs.get('project_slug'),
            subproject_slug=kwargs.get('subproject_slug'),
            lang_slug=kwargs.get('lang_slug'),
            version_slug=version_slug,
            filename=kwargs.get('filename', ''),
        )

        storage_root_path = final_project.get_storage_path(
            type_='html',
            version_slug=version_slug,
            include_file=False,
            version_type=self.version_type,
        )

        # First, check for dirhtml with slash
        for tryfile in ('index.html', 'README.html'):
            storage_filename_path = build_media_storage.join(
                storage_root_path,
                f'{filename}/{tryfile}'.lstrip('/'),
            )
            log.debug(
                'Trying index filename: project=%s version=%s, file=%s',
                final_project.slug,
                version_slug,
                storage_filename_path,
            )
            if not build_media_storage.exists(storage_filename_path):
                continue

            log.info(
                'Redirecting to index file: project=%s version=%s, storage_path=%s',
                final_project.slug,
                version_slug,
                storage_filename_path,
            )
            # ``index.html`` is served from the directory URL itself, while
            # ``README.html`` has to be named explicitly.
            parts = urlparse(proxito_path)
            new_path = parts.path.rstrip('/') + '/'
            if tryfile == 'README.html':
                new_path += tryfile

            # `proxito_path` doesn't include query params, so take them from
            # the request to maintain GET args in our redirect.
            query = urlparse(request.get_full_path()).query
            redirect_url = parts._replace(path=new_path, query=query).geturl()

            # TODO: decide if we need to check for infinite redirect here
            # (from URL == to URL)
            return HttpResponseRedirect(redirect_url)

        # ``redirect_filename`` is the path without ``/<lang>/<version>`` and
        # without query, starting with a ``/``. This matches our old logic:
        # https://github.com/readthedocs/readthedocs.org/blob/4b09c7a0ab45cd894c3373f7f07bad7161e4b223/readthedocs/redirects/utils.py#L60
        # We parse ``filename`` to remove the query from it.
        #
        # We can't check for lang and version here to decide if we need to add
        # the ``/`` or not because ``/install.html`` is a valid path to use as
        # redirect and does not include lang and version on it. It should be
        # fine always adding the ``/`` to the beginning.
        redirect_filename = '/' + urlparse(filename).path.lstrip('/')

        # Check and perform redirects on 404 handler
        # NOTE: this redirect check must be done after trying files like
        # ``index.html`` and ``README.html`` to emulate the behavior we had when
        # serving directly from NGINX without passing through Python.
        redirect_path, http_status = self.get_redirect(
            project=final_project,
            lang_slug=lang_slug,
            version_slug=version_slug,
            filename=redirect_filename,
            full_path=proxito_path,
        )
        if redirect_path and http_status:
            try:
                return self.get_redirect_response(request, redirect_path,
                                                  proxito_path, http_status)
            except InfiniteRedirectException:
                # Continue with our normal 404 handling in this case
                pass

        # If that doesn't work, attempt to serve the 404 of the current version (version_slug)
        # Secondly, try to serve the 404 page for the default version
        # (project.get_default_version())
        candidate_slugs = [version_slug]
        default_version_slug = final_project.get_default_version()
        if default_version_slug != version_slug:
            candidate_slugs.append(default_version_slug)

        for version_slug_404 in candidate_slugs:
            if not self.allowed_user(request, final_project, version_slug_404):
                continue

            doc_type_404 = (
                Version.objects
                .filter(project=final_project, slug=version_slug_404)
                .values_list('documentation_type', flat=True)
                .first()
            )
            storage_root_path = final_project.get_storage_path(
                type_='html',
                version_slug=version_slug_404,
                include_file=False,
                version_type=self.version_type,
            )
            tryfiles = ['404.html']
            # SPHINX_HTMLDIR is the only builder
            # that could output a 404/index.html file.
            if doc_type_404 == SPHINX_HTMLDIR:
                tryfiles.append('404/index.html')
            for tryfile in tryfiles:
                storage_filename_path = build_media_storage.join(
                    storage_root_path, tryfile)
                if not build_media_storage.exists(storage_filename_path):
                    continue

                log.info(
                    'Serving custom 404.html page: [project: %s] [version: %s]',
                    final_project.slug,
                    version_slug_404,
                )
                # Read inside a ``with`` block so the storage file handle is
                # closed instead of being leaked on every custom 404 served.
                with build_media_storage.open(storage_filename_path) as fd:  # pylint: disable=invalid-name
                    content = fd.read()
                resp = HttpResponse(content)
                resp.status_code = 404
                return resp

        raise Http404('No custom 404 page found.')
Example #7
0
def _create_intersphinx_data(version, commit, build):
    """
    Create intersphinx data for this version.

    Reads the ``objects.inv`` inventory from the version's HTML build output
    and creates one ``SphinxDomain`` per inventory entry whose page has a
    matching ``HTMLFile`` for this build. Nothing is done for non-Sphinx
    versions or when there is no ``objects.inv``.

    :param version: Version instance
    :param commit: Commit that updated path
    :param build: Build id
    """
    if not version.is_sphinx_type:
        return

    html_storage_path = version.project.get_storage_path(
        type_='html', version_slug=version.slug, include_file=False
    )
    json_storage_path = version.project.get_storage_path(
        type_='json', version_slug=version.slug, include_file=False
    )

    object_file = build_media_storage.join(html_storage_path, 'objects.inv')
    if not build_media_storage.exists(object_file):
        log.debug('No objects.inv, skipping intersphinx indexing.')
        return

    # Optional display names for domain types and page titles
    type_file = build_media_storage.join(json_storage_path, 'readthedocs-sphinx-domain-names.json')
    types = {}
    titles = {}
    if build_media_storage.exists(type_file):
        try:
            # ``with`` makes sure the storage file handle gets closed
            with build_media_storage.open(type_file) as type_fd:
                data = json.load(type_fd)
            types = data['types']
            titles = data['titles']
        except Exception:
            log.exception('Exception parsing readthedocs-sphinx-domain-names.json')

    # These classes are copied from Sphinx
    # https://github.com/sphinx-doc/sphinx/blob/d79d041f4f90818e0b495523fdcc28db12783caf/sphinx/ext/intersphinx.py#L400-L403  # noqa
    class MockConfig:
        intersphinx_timeout = None
        tls_verify = False
        user_agent = None

    class MockApp:
        srcdir = ''
        config = MockConfig()

        def warn(self, msg):
            log.warning('Sphinx MockApp.', msg=msg)

    # Re-create all objects from the new build of the version
    object_file_url = build_media_storage.url(object_file)
    if object_file_url.startswith('/'):
        # Filesystem backed storage simply prepends MEDIA_URL to the path to get the URL
        # This can cause an issue if MEDIA_URL is not fully qualified
        object_file_url = settings.RTD_INTERSPHINX_URL + object_file_url

    invdata = intersphinx.fetch_inventory(MockApp(), '', object_file_url)
    # Guard the inventory itself: an empty/``None`` result means nothing to index
    for key, value in sorted((invdata or {}).items()):
        domain, _type = key.split(':', 1)
        for name, einfo in sorted(value.items()):
            # project, version, url, display_name
            # ('Sphinx', '1.7.9', 'faq.html#epub-faq', 'Epub info')
            try:
                url = einfo[2]
                # Split on the first ``#`` only: the anchor can contain
                # ``#`` characters. Without ``#`` the anchor is empty.
                doc_name, _, anchor = url.partition('#')
                display_name = einfo[3]
            except Exception:
                log.exception(
                    'Error while getting sphinx domain information. Skipping...',
                    project_slug=version.project.slug,
                    version_slug=version.slug,
                    sphinx_domain=f'{domain}->{name}',
                )
                continue

            # HACK: This is done because the difference between
            # ``sphinx.builders.html.StandaloneHTMLBuilder``
            # and ``sphinx.builders.dirhtml.DirectoryHTMLBuilder``.
            # They both have different ways of generating HTML Files,
            # and therefore the doc_name generated is different.
            # More info on: http://www.sphinx-doc.org/en/master/usage/builders/index.html#builders
            # Also see issue: https://github.com/readthedocs/readthedocs.org/issues/5821
            if doc_name.endswith('/'):
                doc_name += 'index.html'

            html_file = HTMLFile.objects.filter(
                project=version.project, version=version,
                path=doc_name, build=build,
            ).first()

            if not html_file:
                log.debug(
                    'HTMLFile object not found.',
                    project_slug=version.project.slug,
                    version_slug=version.slug,
                    build_id=build,
                    doc_name=doc_name
                )

                # Don't create Sphinx Domain objects
                # if the HTMLFile object is not found.
                continue

            SphinxDomain.objects.create(
                project=version.project,
                version=version,
                html_file=html_file,
                domain=domain,
                name=name,
                display_name=display_name,
                type=_type,
                type_display=types.get(f'{domain}:{_type}', ''),
                doc_name=doc_name,
                doc_display=titles.get(doc_name, ''),
                anchor=anchor,
                commit=commit,
                build=build,
            )