def _get_page_content_from_storage(self, project, version_slug, filename):
    """
    Return the raw content of ``filename`` for a version of ``project``.

    Only versions with PUBLIC privacy level are looked up, because the file
    is read directly from storage and no per-user authorization happens
    here. Returns ``None`` if the file cannot be read.
    """
    version = get_object_or_404(
        project.versions,
        slug=version_slug,
        # Only allow PUBLIC versions when getting the content from our
        # storage for privacy/security reasons
        privacy_level=PUBLIC,
    )

    root_path = project.get_storage_path(
        'html',
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    full_path = build_media_storage.join(root_path, filename)

    try:
        with build_media_storage.open(full_path) as page_file:
            return page_file.read()
    except Exception:  # noqa
        # Best-effort read: a missing/broken file is logged, not raised.
        log.warning('Unable to read file. file_path=%s', full_path)

    return None
def _get_doc_content(project, version, doc):
    """
    Load the parsed ``.fjson`` payload for document ``doc``.

    Looks the file up in the project's JSON build artifacts for ``version``.
    Returns the decoded JSON object, or ``None`` when the file is missing
    or unreadable.
    """
    root_path = project.get_storage_path(
        'json',
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    relative_name = f'{doc}.fjson'.lstrip('/')
    full_path = build_media_storage.join(root_path, relative_name)

    try:
        with build_media_storage.open(full_path) as fjson_file:
            return json.load(fjson_file)
    except Exception:  # noqa
        # Best-effort read: a missing/broken file is logged, not raised.
        log.warning('Unable to read file. file_path=%s', full_path)

    return None
def get(self, request, proxito_path, template_name='404.html'):
    """
    Handler for 404 pages on subdomains.

    This does a couple things:

    * Handles directory indexing for URLs that don't end in a slash
    * Handles directory indexing for README.html (for now)
    * Handles custom 404 serving

    For 404's, first search for a 404 page in the current version, then continues
    with the default version and finally, if none of them are found, the Read
    the Docs default page (Maze Found) is rendered by Django and served.
    """
    # pylint: disable=too-many-locals
    log.info('Executing 404 handler. proxito_path=%s', proxito_path)

    # Parse the URL using the normal urlconf, so we get proper subdomain/translation data
    _, __, kwargs = url_resolve(
        proxito_path,
        urlconf='readthedocs.proxito.urls',
    )

    version_slug = kwargs.get('version_slug')
    version_slug = self.get_version_from_host(request, version_slug)
    final_project, lang_slug, version_slug, filename = _get_project_data_from_request(  # noqa
        request,
        project_slug=kwargs.get('project_slug'),
        subproject_slug=kwargs.get('subproject_slug'),
        lang_slug=kwargs.get('lang_slug'),
        version_slug=version_slug,
        filename=kwargs.get('filename', ''),
    )

    storage_root_path = final_project.get_storage_path(
        type_='html',
        version_slug=version_slug,
        include_file=False,
        version_type=self.version_type,
    )

    # First, check for dirhtml with slash
    for tryfile in ('index.html', 'README.html'):
        # FIX: interpolate the requested ``filename`` into the probe path.
        # The previous code joined a literal placeholder string instead,
        # so the index-file probe could never match the requested path.
        storage_filename_path = build_media_storage.join(
            storage_root_path,
            f'{filename}/{tryfile}'.lstrip('/'),
        )
        log.debug(
            'Trying index filename: project=%s version=%s, file=%s',
            final_project.slug,
            version_slug,
            storage_filename_path,
        )
        if build_media_storage.exists(storage_filename_path):
            log.info(
                'Redirecting to index file: project=%s version=%s, storage_path=%s',
                final_project.slug,
                version_slug,
                storage_filename_path,
            )
            # Use urlparse so that we maintain GET args in our redirect
            parts = urlparse(proxito_path)
            if tryfile == 'README.html':
                new_path = parts.path.rstrip('/') + f'/{tryfile}'
            else:
                new_path = parts.path.rstrip('/') + '/'
            # `proxito_path` doesn't include query params.
            query = urlparse(request.get_full_path()).query
            new_parts = parts._replace(
                path=new_path,
                query=query,
            )
            redirect_url = new_parts.geturl()

            # TODO: decide if we need to check for infinite redirect here
            # (from URL == to URL)
            return HttpResponseRedirect(redirect_url)

    # ``redirect_filename`` is the path without ``/<lang>/<version>`` and
    # without query, starting with a ``/``. This matches our old logic:
    # https://github.com/readthedocs/readthedocs.org/blob/4b09c7a0ab45cd894c3373f7f07bad7161e4b223/readthedocs/redirects/utils.py#L60
    # We parse ``filename`` to remove the query from it
    schema, netloc, path, params, query, fragments = urlparse(filename)
    redirect_filename = path

    # we can't check for lang and version here to decide if we need to add
    # the ``/`` or not because ``/install.html`` is a valid path to use as
    # redirect and does not include lang and version on it. It should be
    # fine always adding the ``/`` to the beginning.
    redirect_filename = '/' + redirect_filename.lstrip('/')

    # Check and perform redirects on 404 handler
    # NOTE: this redirect check must be done after trying files like
    # ``index.html`` and ``README.html`` to emulate the behavior we had when
    # serving directly from NGINX without passing through Python.
    redirect_path, http_status = self.get_redirect(
        project=final_project,
        lang_slug=lang_slug,
        version_slug=version_slug,
        filename=redirect_filename,
        full_path=proxito_path,
    )
    if redirect_path and http_status:
        try:
            return self.get_redirect_response(request, redirect_path, proxito_path, http_status)
        except InfiniteRedirectException:
            # Continue with our normal 404 handling in this case
            pass

    # If that doesn't work, attempt to serve the 404 of the current version (version_slug)
    # Secondly, try to serve the 404 page for the default version
    # (project.get_default_version())
    doc_type = (
        Version.objects.filter(project=final_project, slug=version_slug)
        .values_list('documentation_type', flat=True)
        .first()
    )
    versions = [(version_slug, doc_type)]
    default_version_slug = final_project.get_default_version()
    if default_version_slug != version_slug:
        default_version_doc_type = (
            Version.objects.filter(project=final_project, slug=default_version_slug)
            .values_list('documentation_type', flat=True)
            .first()
        )
        versions.append((default_version_slug, default_version_doc_type))

    for version_slug_404, doc_type_404 in versions:
        # Skip versions the requesting user is not allowed to see.
        if not self.allowed_user(request, final_project, version_slug_404):
            continue

        storage_root_path = final_project.get_storage_path(
            type_='html',
            version_slug=version_slug_404,
            include_file=False,
            version_type=self.version_type,
        )
        tryfiles = ['404.html']
        # SPHINX_HTMLDIR is the only builder
        # that could output a 404/index.html file.
        if doc_type_404 == SPHINX_HTMLDIR:
            tryfiles.append('404/index.html')
        for tryfile in tryfiles:
            storage_filename_path = build_media_storage.join(storage_root_path, tryfile)
            if build_media_storage.exists(storage_filename_path):
                log.info(
                    'Serving custom 404.html page: [project: %s] [version: %s]',
                    final_project.slug,
                    version_slug_404,
                )
                resp = HttpResponse(build_media_storage.open(storage_filename_path).read())
                resp.status_code = 404
                return resp

    raise Http404('No custom 404 page found.')
def _create_intersphinx_data(version, commit, build):
    """
    Create intersphinx data for this version.

    Fetches the Sphinx ``objects.inv`` inventory from storage and re-creates
    the ``SphinxDomain`` rows for the given build, resolving each entry to an
    existing ``HTMLFile`` when possible.

    :param version: Version instance
    :param commit: Commit that updated path
    :param build: Build id
    """
    if not version.is_sphinx_type:
        return

    html_storage_path = version.project.get_storage_path(
        type_='html', version_slug=version.slug, include_file=False
    )
    json_storage_path = version.project.get_storage_path(
        type_='json', version_slug=version.slug, include_file=False
    )

    object_file = build_media_storage.join(html_storage_path, 'objects.inv')
    if not build_media_storage.exists(object_file):
        log.debug('No objects.inv, skipping intersphinx indexing.')
        return

    type_file = build_media_storage.join(json_storage_path, 'readthedocs-sphinx-domain-names.json')
    types = {}
    titles = {}
    if build_media_storage.exists(type_file):
        try:
            data = json.load(build_media_storage.open(type_file))
            types = data['types']
            titles = data['titles']
        except Exception:
            log.exception('Exception parsing readthedocs-sphinx-domain-names.json')

    # These classes are copied from Sphinx
    # https://github.com/sphinx-doc/sphinx/blob/d79d041f4f90818e0b495523fdcc28db12783caf/sphinx/ext/intersphinx.py#L400-L403  # noqa
    class MockConfig:
        intersphinx_timeout = None
        tls_verify = False
        user_agent = None

    class MockApp:
        srcdir = ''
        config = MockConfig()

        def warn(self, msg):
            log.warning('Sphinx MockApp.', msg=msg)

    # Re-create all objects from the new build of the version
    object_file_url = build_media_storage.url(object_file)
    if object_file_url.startswith('/'):
        # Filesystem backed storage simply prepends MEDIA_URL to the path to get the URL
        # This can cause an issue if MEDIA_URL is not fully qualified
        object_file_url = settings.RTD_INTERSPHINX_URL + object_file_url

    invdata = intersphinx.fetch_inventory(MockApp(), '', object_file_url)
    for key, value in sorted(invdata.items() or {}):
        domain, _type = key.split(':', 1)
        for name, einfo in sorted(value.items()):
            # project, version, url, display_name
            # ('Sphinx', '1.7.9', 'faq.html#epub-faq', 'Epub info')
            try:
                url = einfo[2]
                if '#' in url:
                    doc_name, anchor = url.split(
                        '#',
                        # The anchor can contain ``#`` characters
                        maxsplit=1
                    )
                else:
                    doc_name, anchor = url, ''
                display_name = einfo[3]
            except Exception:
                log.exception(
                    'Error while getting sphinx domain information. Skipping...',
                    project_slug=version.project.slug,
                    version_slug=version.slug,
                    # FIX: this was a plain string literal missing the ``f``
                    # prefix, so the log recorded the literal placeholder
                    # '{domain}->{name}' instead of the actual values.
                    sphinx_domain=f'{domain}->{name}',
                )
                continue

            # HACK: This is done because the difference between
            # ``sphinx.builders.html.StandaloneHTMLBuilder``
            # and ``sphinx.builders.dirhtml.DirectoryHTMLBuilder``.
            # They both have different ways of generating HTML Files,
            # and therefore the doc_name generated is different.
            # More info on: http://www.sphinx-doc.org/en/master/usage/builders/index.html#builders
            # Also see issue: https://github.com/readthedocs/readthedocs.org/issues/5821
            if doc_name.endswith('/'):
                doc_name += 'index.html'

            html_file = HTMLFile.objects.filter(
                project=version.project,
                version=version,
                path=doc_name,
                build=build,
            ).first()

            if not html_file:
                log.debug(
                    'HTMLFile object not found.',
                    project_slug=version.project.slug,
                    version_slug=version.slug,
                    build_id=build,
                    doc_name=doc_name
                )
                # Don't create Sphinx Domain objects
                # if the HTMLFile object is not found.
                continue

            SphinxDomain.objects.create(
                project=version.project,
                version=version,
                html_file=html_file,
                domain=domain,
                name=name,
                display_name=display_name,
                type=_type,
                type_display=types.get(f'{domain}:{_type}', ''),
                doc_name=doc_name,
                doc_display=titles.get(doc_name, ''),
                anchor=anchor,
                commit=commit,
                build=build,
            )