def _get_page_content_from_storage(self, project, version_slug, filename):
    """
    Read the content of a built HTML file from the media storage.

    Only PUBLIC versions are served from storage for privacy/security
    reasons; a non-public slug raises ``Http404`` via ``get_object_or_404``.

    Returns the raw file content, or ``None`` when the file can't be read.
    """
    version = get_object_or_404(
        project.versions,
        slug=version_slug,
        # Only allow PUBLIC versions when getting the content from our
        # storage for privacy/security reasons
        privacy_level=PUBLIC,
    )
    storage_path = project.get_storage_path(
        'html',
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    file_path = build_media_storage.join(storage_path, filename)
    try:
        with build_media_storage.open(file_path) as page_file:
            return page_file.read()
    except Exception:  # noqa
        # Best-effort read: a missing/corrupt file is logged, not raised.
        log.warning('Unable to read file. file_path=%s', file_path)
        return None
def _create_imported_files(*, version, commit, build, search_ranking, search_ignore):
    """
    Create imported files for version.

    Walks the built HTML in storage and creates one ``HTMLFile`` per page,
    annotated with its search rank and ignore flag, then fires the
    ``files_changed`` signal (used for purging the CDN).

    :param version: Version instance
    :param commit: Commit that updated path
    :param build: Build id
    :param search_ranking: mapping of path pattern -> rank applied to search
    :param search_ignore: iterable of path patterns excluded from search
    """
    # Re-create all objects from the new build of the version
    storage_path = version.project.get_storage_path(
        type_='html', version_slug=version.slug, include_file=False
    )

    # Last pattern to match takes precedence.
    # XXX: see if we can implement another type of precedence,
    # like the longest pattern.
    # Hoisted out of the walk loop: the reversed ranking list is identical
    # for every file, so build it once instead of per filename.
    reverse_rankings = list(reversed(list(search_ranking.items())))

    for root, __, filenames in build_media_storage.walk(storage_path):
        for filename in filenames:
            # We don't care about non-HTML files
            if not filename.endswith('.html'):
                continue

            full_path = build_media_storage.join(root, filename)

            # Generate a relative path for storage similar to os.path.relpath
            relpath = full_path.replace(storage_path, '', 1).lstrip('/')

            page_rank = 0
            for pattern, rank in reverse_rankings:
                if fnmatch(relpath, pattern):
                    page_rank = rank
                    break

            # Excluded from search when any ignore pattern matches.
            ignore = any(fnmatch(relpath, pattern) for pattern in search_ignore)

            # Create imported files from new build
            HTMLFile.objects.create(
                project=version.project,
                version=version,
                path=relpath,
                name=filename,
                rank=page_rank,
                commit=commit,
                build=build,
                ignore=ignore,
            )

    # This signal is used for purging the CDN.
    files_changed.send(
        sender=Project,
        project=version.project,
        version=version,
    )
def get(self, request, project):
    """
    Serve custom user's defined ``/robots.txt``.

    If the user added a ``robots.txt`` in the "default version" of the
    project, we serve it directly.
    """
    # Use the ``robots.txt`` file from the default version configured
    version_slug = project.get_default_version()
    version = project.versions.get(slug=version_slug)

    # Only serve robots.txt when the default version is public, active
    # and built; otherwise respond with a 404.
    if (
        version.privacy_level == constants.PRIVATE
        or not version.active
        or not version.built
    ):
        raise Http404()

    storage_path = project.get_storage_path(
        type_='html',
        version_slug=version_slug,
        include_file=False,
        version_type=self.version_type,
    )
    path = build_media_storage.join(storage_path, 'robots.txt')

    if build_media_storage.exists(path):
        # Drop scheme and domain so the path can be served through an
        # NGINX internal redirect.
        storage_url = build_media_storage.url(path)
        url = urlparse(storage_url)._replace(scheme='', netloc='').geturl()
        return self._serve_docs(
            request,
            final_project=project,
            path=url,
        )

    # No custom robots.txt in storage: render the default template.
    sitemap_url = '{scheme}://{domain}/sitemap.xml'.format(
        scheme='https',
        domain=project.subdomain(),
    )
    return render(
        request,
        'robots.txt',
        {
            'sitemap_url': sitemap_url,
            'hidden_paths': self._get_hidden_paths(project),
        },
        content_type='text/plain',
    )
def _get_doc_content(project, version, doc):
    """
    Load the parsed JSON (``.fjson``) content for ``doc`` from storage.

    Returns the decoded JSON object, or ``None`` when the file is
    missing or unreadable.
    """
    storage_path = project.get_storage_path(
        'json',
        version_slug=version.slug,
        include_file=False,
        version_type=version.type,
    )
    # ``lstrip('/')`` keeps the joined path relative to the storage root.
    file_path = build_media_storage.join(storage_path, f'{doc}.fjson'.lstrip('/'))
    try:
        with build_media_storage.open(file_path) as json_file:
            return json.load(json_file)
    except Exception:  # noqa
        log.warning('Unable to read file. file_path=%s', file_path)
        return None
def get(
        self,
        request,
        project_slug=None,
        subproject_slug=None,
        subproject_slash=None,
        lang_slug=None,
        version_slug=None,
        filename='',
):  # noqa
    """
    Take the incoming parsed URL's and figure out what file to serve.

    ``subproject_slash`` is used to determine if the subproject URL has a slash,
    so that we can decide if we need to serve docs or add a /.
    """
    # The version may be encoded in the host name (e.g. external/PR builds).
    version_slug = self.get_version_from_host(request, version_slug)
    final_project, lang_slug, version_slug, filename = _get_project_data_from_request(  # noqa
        request,
        project_slug=project_slug,
        subproject_slug=subproject_slug,
        lang_slug=lang_slug,
        version_slug=version_slug,
        filename=filename,
    )

    log.debug(
        'Serving docs: project=%s, subproject=%s, lang_slug=%s, version_slug=%s, filename=%s',
        final_project.slug, subproject_slug, lang_slug, version_slug, filename)

    # Handle requests that need canonicalizing (eg. HTTP -> HTTPS, redirect to canonical domain)
    if hasattr(request, 'canonicalize'):
        try:
            return self.canonical_redirect(request, final_project, version_slug, filename)
        except InfiniteRedirectException:
            # Don't redirect in this case, since it would break things
            pass

    # Handle a / redirect when we aren't a single version
    if all([
            lang_slug is None,
            # External versions/builds will always have a version,
            # because it is taken from the host name
            version_slug is None or hasattr(request, 'external_domain'),
            filename == '',
            not final_project.single_version,
    ]):
        return self.system_redirect(request, final_project, lang_slug, version_slug, filename)

    # Handle `/projects/subproject` URL redirection:
    # when there _is_ a subproject_slug but not a subproject_slash
    if all([
            final_project.single_version,
            filename == '',
            subproject_slug,
            not subproject_slash,
    ]):
        return self.system_redirect(request, final_project, lang_slug, version_slug, filename)

    # A versioned (non single-version) project needs both language and
    # version in the URL; external versions are exempt from this check.
    if all([
            (lang_slug is None or version_slug is None),
            not final_project.single_version,
            self.version_type != EXTERNAL,
    ]):
        log.warning(
            'Invalid URL for project with versions. url=%s, project=%s',
            filename, final_project.slug)
        raise Http404('Invalid URL for project with versions')

    # TODO: un-comment when ready to perform redirect here
    # redirect_path, http_status = self.get_redirect(
    #     final_project,
    #     lang_slug,
    #     version_slug,
    #     filename,
    #     request.path,
    # )
    # if redirect_path and http_status:
    #     return self.get_redirect_response(request, redirect_path, http_status)

    # Check user permissions and return an unauthed response if needed
    if not self.allowed_user(request, final_project, version_slug):
        return self.get_unauthed_response(request, final_project)

    storage_path = final_project.get_storage_path(
        type_='html',
        version_slug=version_slug,
        include_file=False,
        version_type=self.version_type,
    )

    # If ``filename`` is empty, serve from ``/``
    path = build_media_storage.join(storage_path, filename.lstrip('/'))
    # Handle our backend storage not supporting directory indexes,
    # so we need to append index.html when appropriate.
    if path[-1] == '/':
        # We need to add the index.html before ``storage.url`` since the
        # Signature and Expire time is calculated per file.
        path += 'index.html'

    # NOTE: calling ``.url`` will remove the trailing slash
    storage_url = build_media_storage.url(path, http_method=request.method)

    # URL without scheme and domain to perform an NGINX internal redirect
    parsed_url = urlparse(storage_url)._replace(scheme='', netloc='')
    final_url = parsed_url.geturl()

    return self._serve_docs(
        request,
        final_project=final_project,
        version_slug=version_slug,
        path=final_url,
    )
def get(self, request, proxito_path, template_name='404.html'):
    """
    Handler for 404 pages on subdomains.

    This does a couple things:

    * Handles directory indexing for URLs that don't end in a slash
    * Handles directory indexing for README.html (for now)
    * Handles custom 404 serving

    For 404's, first search for a 404 page in the current version, then
    continues with the default version and finally, if none of them are
    found, the Read the Docs default page (Maze Found) is rendered by
    Django and served.
    """
    # pylint: disable=too-many-locals
    log.info('Executing 404 handler. proxito_path=%s', proxito_path)

    # Parse the URL using the normal urlconf, so we get proper subdomain/translation data
    _, __, kwargs = url_resolve(
        proxito_path,
        urlconf='readthedocs.proxito.urls',
    )

    version_slug = kwargs.get('version_slug')
    version_slug = self.get_version_from_host(request, version_slug)
    final_project, lang_slug, version_slug, filename = _get_project_data_from_request(  # noqa
        request,
        project_slug=kwargs.get('project_slug'),
        subproject_slug=kwargs.get('subproject_slug'),
        lang_slug=kwargs.get('lang_slug'),
        version_slug=version_slug,
        filename=kwargs.get('filename', ''),
    )

    storage_root_path = final_project.get_storage_path(
        type_='html',
        version_slug=version_slug,
        include_file=False,
        version_type=self.version_type,
    )

    # First, check for dirhtml with slash
    for tryfile in ('index.html', 'README.html'):
        # FIX: build the candidate path from the requested ``filename``;
        # it was previously a hard-coded placeholder string, so the
        # directory-index probe never matched the requested directory.
        storage_filename_path = build_media_storage.join(
            storage_root_path,
            f'{filename}/{tryfile}'.lstrip('/'),
        )
        log.debug(
            'Trying index filename: project=%s version=%s, file=%s',
            final_project.slug,
            version_slug,
            storage_filename_path,
        )
        if build_media_storage.exists(storage_filename_path):
            log.info(
                'Redirecting to index file: project=%s version=%s, storage_path=%s',
                final_project.slug,
                version_slug,
                storage_filename_path,
            )
            # Use urlparse so that we maintain GET args in our redirect
            parts = urlparse(proxito_path)
            if tryfile == 'README.html':
                new_path = parts.path.rstrip('/') + f'/{tryfile}'
            else:
                new_path = parts.path.rstrip('/') + '/'

            # `proxito_path` doesn't include query params.`
            query = urlparse(request.get_full_path()).query
            new_parts = parts._replace(
                path=new_path,
                query=query,
            )
            redirect_url = new_parts.geturl()

            # TODO: decide if we need to check for infinite redirect here
            # (from URL == to URL)
            return HttpResponseRedirect(redirect_url)

    # ``redirect_filename`` is the path without ``/<lang>/<version>`` and
    # without query, starting with a ``/``. This matches our old logic:
    # https://github.com/readthedocs/readthedocs.org/blob/4b09c7a0ab45cd894c3373f7f07bad7161e4b223/readthedocs/redirects/utils.py#L60
    # We parse ``filename`` to remove the query from it
    redirect_filename = urlparse(filename).path

    # we can't check for lang and version here to decide if we need to add
    # the ``/`` or not because ``/install.html`` is a valid path to use as
    # redirect and does not include lang and version on it. It should be
    # fine always adding the ``/`` to the beginning.
    redirect_filename = '/' + redirect_filename.lstrip('/')

    # Check and perform redirects on 404 handler
    # NOTE: this redirect check must be done after trying files like
    # ``index.html`` and ``README.html`` to emulate the behavior we had when
    # serving directly from NGINX without passing through Python.
    redirect_path, http_status = self.get_redirect(
        project=final_project,
        lang_slug=lang_slug,
        version_slug=version_slug,
        filename=redirect_filename,
        full_path=proxito_path,
    )
    if redirect_path and http_status:
        try:
            return self.get_redirect_response(request, redirect_path, proxito_path, http_status)
        except InfiniteRedirectException:
            # Continue with our normal 404 handling in this case
            pass

    # If that doesn't work, attempt to serve the 404 of the current version (version_slug)
    # Secondly, try to serve the 404 page for the default version
    # (project.get_default_version())
    doc_type = (
        Version.objects.filter(project=final_project, slug=version_slug)
        .values_list('documentation_type', flat=True)
        .first()
    )
    versions = [(version_slug, doc_type)]
    default_version_slug = final_project.get_default_version()
    if default_version_slug != version_slug:
        default_version_doc_type = (
            Version.objects.filter(project=final_project, slug=default_version_slug)
            .values_list('documentation_type', flat=True)
            .first()
        )
        versions.append((default_version_slug, default_version_doc_type))

    for version_slug_404, doc_type_404 in versions:
        # Skip versions the current user isn't allowed to see.
        if not self.allowed_user(request, final_project, version_slug_404):
            continue

        storage_root_path = final_project.get_storage_path(
            type_='html',
            version_slug=version_slug_404,
            include_file=False,
            version_type=self.version_type,
        )
        tryfiles = ['404.html']
        # SPHINX_HTMLDIR is the only builder
        # that could output a 404/index.html file.
        if doc_type_404 == SPHINX_HTMLDIR:
            tryfiles.append('404/index.html')
        for tryfile in tryfiles:
            storage_filename_path = build_media_storage.join(storage_root_path, tryfile)
            if build_media_storage.exists(storage_filename_path):
                log.info(
                    'Serving custom 404.html page: [project: %s] [version: %s]',
                    final_project.slug,
                    version_slug_404,
                )
                resp = HttpResponse(build_media_storage.open(storage_filename_path).read())
                resp.status_code = 404
                return resp

    raise Http404('No custom 404 page found.')
def _create_intersphinx_data(version, commit, build):
    """
    Create intersphinx data for this version.

    Fetches ``objects.inv`` from storage and creates a ``SphinxDomain``
    object per inventory entry that maps to an existing ``HTMLFile``.

    :param version: Version instance
    :param commit: Commit that updated path
    :param build: Build id
    """
    # Intersphinx inventories only exist for Sphinx-based builds.
    if not version.is_sphinx_type:
        return

    html_storage_path = version.project.get_storage_path(
        type_='html', version_slug=version.slug, include_file=False
    )
    json_storage_path = version.project.get_storage_path(
        type_='json', version_slug=version.slug, include_file=False
    )

    object_file = build_media_storage.join(html_storage_path, 'objects.inv')
    if not build_media_storage.exists(object_file):
        log.debug('No objects.inv, skipping intersphinx indexing.')
        return

    type_file = build_media_storage.join(json_storage_path, 'readthedocs-sphinx-domain-names.json')
    types = {}
    titles = {}
    if build_media_storage.exists(type_file):
        # Best-effort: missing/corrupt domain-names file just means empty
        # ``types``/``titles`` lookups below.
        try:
            data = json.load(build_media_storage.open(type_file))
            types = data['types']
            titles = data['titles']
        except Exception:
            log.exception('Exception parsing readthedocs-sphinx-domain-names.json')

    # These classes are copied from Sphinx
    # https://github.com/sphinx-doc/sphinx/blob/d79d041f4f90818e0b495523fdcc28db12783caf/sphinx/ext/intersphinx.py#L400-L403  # noqa
    class MockConfig:
        intersphinx_timeout = None
        tls_verify = False
        user_agent = None

    class MockApp:
        srcdir = ''
        config = MockConfig()

        def warn(self, msg):
            log.warning('Sphinx MockApp.', msg=msg)

    # Re-create all objects from the new build of the version
    object_file_url = build_media_storage.url(object_file)
    if object_file_url.startswith('/'):
        # Filesystem backed storage simply prepends MEDIA_URL to the path to get the URL
        # This can cause an issue if MEDIA_URL is not fully qualified
        object_file_url = settings.RTD_INTERSPHINX_URL + object_file_url

    invdata = intersphinx.fetch_inventory(MockApp(), '', object_file_url)
    for key, value in sorted(invdata.items() or {}):
        domain, _type = key.split(':', 1)
        for name, einfo in sorted(value.items()):
            # einfo is: project, version, url, display_name
            # e.g. ('Sphinx', '1.7.9', 'faq.html#epub-faq', 'Epub info')
            try:
                url = einfo[2]
                if '#' in url:
                    doc_name, anchor = url.split(
                        '#',
                        # The anchor can contain ``#`` characters
                        maxsplit=1
                    )
                else:
                    doc_name, anchor = url, ''
                display_name = einfo[3]
            except Exception:
                log.exception(
                    'Error while getting sphinx domain information. Skipping...',
                    project_slug=version.project.slug,
                    version_slug=version.slug,
                    # FIX: this was a plain string literal ('{domain}->{name}'),
                    # so the log always showed the braces instead of the values.
                    sphinx_domain=f'{domain}->{name}',
                )
                continue

            # HACK: This is done because the difference between
            # ``sphinx.builders.html.StandaloneHTMLBuilder``
            # and ``sphinx.builders.dirhtml.DirectoryHTMLBuilder``.
            # They both have different ways of generating HTML Files,
            # and therefore the doc_name generated is different.
            # More info on: http://www.sphinx-doc.org/en/master/usage/builders/index.html#builders
            # Also see issue: https://github.com/readthedocs/readthedocs.org/issues/5821
            if doc_name.endswith('/'):
                doc_name += 'index.html'

            html_file = HTMLFile.objects.filter(
                project=version.project,
                version=version,
                path=doc_name,
                build=build,
            ).first()

            if not html_file:
                log.debug(
                    'HTMLFile object not found.',
                    project_slug=version.project.slug,
                    version_slug=version.slug,
                    build_id=build,
                    doc_name=doc_name
                )
                # Don't create Sphinx Domain objects
                # if the HTMLFile object is not found.
                continue

            SphinxDomain.objects.create(
                project=version.project,
                version=version,
                html_file=html_file,
                domain=domain,
                name=name,
                display_name=display_name,
                type=_type,
                type_display=types.get(f'{domain}:{_type}', ''),
                doc_name=doc_name,
                doc_display=titles.get(doc_name, ''),
                anchor=anchor,
                commit=commit,
                build=build,
            )