def get_seo_description(content, locale=None, strip_markup=True):
    # Create an SEO summary
    # TODO: Google only takes the first 180 characters, so maybe we should
    # find the end of the last complete sentence before 180?
    seo_summary = ''
    if content:
        # Try constraining the search for a summary to an explicit
        # "Summary" section, if any.
        summary_section = (parse(content).extractSection('Summary')
                           .serialize())
        if summary_section:
            content = summary_section

        # Need to add a <br /> to the page content, otherwise PyQuery won't
        # find a <p></p> element if it's the only element in the doc_html
        seo_analyze_doc_html = content + '<br />'
        page = pq(seo_analyze_doc_html)

        # Look for the SEO summary class first
        summaryClasses = page.find('.seoSummary')
        if len(summaryClasses):
            if strip_markup:
                seo_summary = summaryClasses.text()
            else:
                seo_summary = ''.join(
                    to_html(item) for item in summaryClasses.items())
        else:
            paragraphs = page.find('p')
            if paragraphs.length:
                for p in range(len(paragraphs)):
                    item = paragraphs.eq(p)
                    if strip_markup:
                        text = item.text()
                    else:
                        text = to_html(item)
                    # Check for a parent length of 2 because we don't want
                    # <p>s wrapped in a <div> (e.g. "<div class='warning'>");
                    # PyQuery adds an "<html><div>" wrapper around the
                    # entire document.
                    text_match = (
                        text and len(text) and
                        'Redirect' not in text and
                        text.find(u'«') == -1 and
                        text.find('&laquo') == -1 and
                        item.parents().length == 2)
                    if text_match:
                        seo_summary = text.strip()
                        break

    if strip_markup:
        # Post-match cleanup: remove markup characters
        seo_summary = seo_summary.replace('<', '').replace('>', '')
        # Remove spaces around some punctuation added by PyQuery
        if locale == 'en-US':
            seo_summary = re.sub(r' ([,\)\.])', r'\1', seo_summary)
            seo_summary = re.sub(r'(\() ', r'\1', seo_summary)

    return seo_summary
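# A minimal usage sketch for get_seo_description() above. This is
# illustrative and not from the original source; it assumes the module's
# own imports (parse, pq, to_html, re) are in place and that
# extractSection() serializes to an empty string when no "Summary"
# section exists:
def test_get_seo_description_prefers_seo_summary():
    # A paragraph tagged .seoSummary wins over the first <p>.
    html = '<p>Intro text.</p><p class="seoSummary">The summary.</p>'
    assert get_seo_description(html, locale='en-US') == 'The summary.'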
def test_xss_file_attachment_title(admin_client, constance_config, root_doc,
                                   wiki_user, editor_client):
    constance_config.WIKI_ATTACHMENT_ALLOWED_TYPES = 'text/plain'

    # use view to create new attachment
    file_for_upload = make_test_file()
    files_url = reverse('attachments.edit_attachment',
                        kwargs={'document_path': root_doc.slug})
    title = '"><img src=x onerror=prompt(navigator.userAgent);>'
    post_data = {
        'title': title,
        'description': 'xss',
        'comment': 'xss',
        'file': file_for_upload,
    }
    response = admin_client.post(files_url, data=post_data)
    assert response.status_code == 302

    # now stick it in/on a document
    attachment = Attachment.objects.get(title=title)
    content = '<img src="%s" />' % attachment.get_file_url()
    root_doc.current_revision = Revision.objects.create(
        document=root_doc, creator=wiki_user, content=content)

    # view it and verify markup is escaped
    response = editor_client.get(root_doc.get_edit_url())
    assert response.status_code == 200
    doc = pq(response.content)
    text = doc('.page-attachments-table .attachment-name-cell').text()
    assert text == ('%s\nxss' % title)
    html = to_html(doc('.page-attachments-table .attachment-name-cell'))
    assert '&gt;&lt;img src=x onerror=prompt(navigator.userAgent);&gt;' in html
    # security bug 1272791
    for script in doc('script'):
        assert title not in script.text_content()
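# A hypothetical sketch of the make_test_file() helper the test above relies
# on; the real helper lives elsewhere in the test suite, so the filename and
# default payload here are assumptions:
from django.core.files.uploadedfile import SimpleUploadedFile

def make_test_file(content=b'I am a test file for upload.'):
    # Build an in-memory text/plain upload, matching the
    # WIKI_ATTACHMENT_ALLOWED_TYPES value set by the test.
    return SimpleUploadedFile('test.txt', content, content_type='text/plain')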
def test_revisions_locale_filter(dashboard_revisions, user_client):
    """Revisions can be filtered by locale."""
    url = urlparams(reverse('dashboards.revisions', locale='fr'),
                    locale='fr')
    response = user_client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
    assert response.status_code == 200

    page = pq(response.content)
    revisions = page.find('.dashboard-row')
    assert revisions.length == 1
    locale = to_html(revisions.find('.locale'))
    assert locale == 'fr'
def test_topic_filter(self):
    url = urlparams(reverse('dashboards.revisions', locale='en-US'),
                    topic='article-with-revisions')
    response = self.client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
    eq_(response.status_code, 200)

    page = pq(response.content)
    revisions = page.find('.dashboard-row')
    eq_(revisions.length, 7)
    for revision in revisions:
        ok_('lorem' not in to_html(pq(revision).find('.dashboard-title')))
def filter_out_noinclude(src):
    """
    Quick and dirty filter to remove <div class="noinclude"> blocks
    """
    # NOTE: This started as an html5lib filter, but it started getting really
    # complex. Seems like pyquery works well enough without corrupting
    # character encoding.
    if not src:
        return ''
    doc = pq(src)
    doc.remove('*[class=noinclude]')
    return to_html(doc)
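# A quick illustrative check of filter_out_noinclude() (assumes pyquery and
# the module's to_html() helper are importable; the exact serialized string
# depends on to_html(), so only containment is asserted):
def demo_filter_out_noinclude():
    src = '<p>Keep me.</p><div class="noinclude">Drop me.</div>'
    result = filter_out_noinclude(src)
    assert 'Keep me.' in result
    assert 'Drop me.' not in result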
def test_revisions_locale_filter(dashboard_revisions, user_client):
    """Revisions can be filtered by locale."""
    url = urlparams(reverse("dashboards.revisions", locale="fr"), locale="fr")
    response = user_client.get(
        url, HTTP_HOST=settings.WIKI_HOST, HTTP_X_REQUESTED_WITH="XMLHttpRequest"
    )
    assert response.status_code == 200

    page = pq(response.content)
    revisions = page.find(".dashboard-row")
    assert revisions.length == 1
    locale = to_html(revisions.find(".locale"))
    assert locale == "fr"
def test_locale_filter(self):
    url = urlparams(reverse('dashboards.revisions', locale='fr'),
                    locale='fr')
    response = self.client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
    eq_(200, response.status_code)

    page = pq(response.content)
    revisions = page.find('.dashboard-row')
    ok_(len(revisions))
    eq_(1, revisions.length)

    ok_('fr' in to_html(pq(revisions[0]).find('.locale')))
def test_known_authors_lookup(self):
    # Only testuser01 is in the Known Authors group
    url = urlparams(reverse('dashboards.revisions', locale='en-US'),
                    authors=RevisionDashboardForm.KNOWN_AUTHORS)
    response = self.client.get(url, HTTP_X_REQUESTED_WITH='XMLHttpRequest')
    eq_(200, response.status_code)

    page = pq(response.content)
    revisions = page.find('.dashboard-row')
    for revision in revisions:
        author = to_html(pq(revision).find('.dashboard-author'))
        ok_('testuser01' in author)
        ok_('testuser2' not in author)
# The `mode` argument implies a parametrize decorator that was missing here;
# 'no-file' for the second case is an assumption based on the else branch.
@pytest.mark.parametrize('mode', ['empty-file', 'no-file'])
def test_edit_attachment_post_with_vacant_file(admin_client, root_doc, tmpdir,
                                               mode):
    post_data = {
        'title': 'Test uploaded file',
        'description': 'A test file uploaded into kuma.',
        'comment': 'Initial upload',
    }
    if mode == 'empty-file':
        empty_file = tmpdir.join('empty')
        empty_file.write('')
        post_data['file'] = empty_file
        expected = 'The submitted file is empty.'
    else:
        expected = 'This field is required.'

    url = reverse('attachments.edit_attachment',
                  kwargs={'document_path': root_doc.slug})
    response = admin_client.post(url, data=post_data)
    assert response.status_code == 200
    doc = pq(response.content)
    assert to_html(doc('ul.errorlist a[href="#id_file"]')) == expected
def test_xss_file_attachment_title(
    admin_client, constance_config, root_doc, wiki_user, editor_client, settings
):
    constance_config.WIKI_ATTACHMENT_ALLOWED_TYPES = "text/plain"

    # use view to create new attachment
    file_for_upload = make_test_file()
    files_url = reverse(
        "attachments.edit_attachment", kwargs={"document_path": root_doc.slug}
    )
    title = '"><img src=x onerror=prompt(navigator.userAgent);>'
    post_data = {
        "title": title,
        "description": "xss",
        "comment": "xss",
        "file": file_for_upload,
    }
    response = admin_client.post(
        files_url, data=post_data, HTTP_HOST=settings.WIKI_HOST
    )
    assert response.status_code == 302

    # now stick it in/on a document
    attachment = Attachment.objects.get(title=title)
    content = '<img src="%s" />' % attachment.get_file_url()
    root_doc.current_revision = Revision.objects.create(
        document=root_doc, creator=wiki_user, content=content
    )

    # view it and verify markup is escaped
    response = editor_client.get(root_doc.get_edit_url(), HTTP_HOST=settings.WIKI_HOST)
    assert response.status_code == 200
    doc = pq(response.content)
    text = doc(".page-attachments-table .attachment-name-cell").text()
    assert text == ("%s\nxss" % title)
    html = to_html(doc(".page-attachments-table .attachment-name-cell"))
    assert "&gt;&lt;img src=x onerror=prompt(navigator.userAgent);&gt;" in html
    # security bug 1272791
    for script in doc("script"):
        assert title not in script.text_content()
# The `mode` argument implies a parametrize decorator that was missing here;
# "no-file" for the second case is an assumption based on the else branch.
@pytest.mark.parametrize("mode", ["empty-file", "no-file"])
def test_edit_attachment_post_with_vacant_file(admin_client, root_doc, tmpdir, mode):
    post_data = {
        "title": "Test uploaded file",
        "description": "A test file uploaded into kuma.",
        "comment": "Initial upload",
    }
    if mode == "empty-file":
        empty_file = tmpdir.join("empty")
        empty_file.write("")
        post_data["file"] = empty_file
        expected = "The submitted file is empty."
    else:
        expected = "This field is required."

    url = reverse(
        "attachments.edit_attachment", kwargs={"document_path": root_doc.slug}
    )
    response = admin_client.post(url, data=post_data, HTTP_HOST=settings.WIKI_HOST)
    assert response.status_code == 200
    doc = pq(response.content)
    assert to_html(doc('ul.errorlist a[href="#id_file"]')) == expected
def test_xss_file_attachment_title(admin_client, constance_config, root_doc,
                                   wiki_user, editor_client):
    constance_config.WIKI_ATTACHMENT_ALLOWED_TYPES = 'text/plain'

    # use view to create new attachment
    file_for_upload = make_test_file()
    files_url = reverse('attachments.edit_attachment',
                        kwargs={'document_path': root_doc.slug},
                        locale='en-US')
    title = '"><img src=x onerror=prompt(navigator.userAgent);>'
    post_data = {
        'title': title,
        'description': 'xss',
        'comment': 'xss',
        'file': file_for_upload,
    }
    response = admin_client.post(files_url, data=post_data)
    assert response.status_code == 302

    # now stick it in/on a document
    attachment = Attachment.objects.get(title=title)
    content = '<img src="%s" />' % attachment.get_file_url()
    root_doc.current_revision = Revision.objects.create(
        document=root_doc, creator=wiki_user, content=content)

    # view it and verify markup is escaped
    response = editor_client.get(root_doc.get_edit_url())
    assert response.status_code == 200
    doc = pq(response.content)
    text = doc('.page-attachments-table .attachment-name-cell').text()
    assert text == ('%s\nxss' % title)
    html = to_html(doc('.page-attachments-table .attachment-name-cell'))
    assert '&gt;&lt;img src=x onerror=prompt(navigator.userAgent);&gt;' in html
    # security bug 1272791
    for script in doc('script'):
        assert title not in script.text_content()
def _document_api_PUT(request, document_slug, document_locale):
    """
    Handle PUT requests for the document_api view.
    """
    # Try parsing one of the supported content types from the request
    try:
        content_type = request.META.get('CONTENT_TYPE', '')

        if content_type.startswith('application/json'):
            data = json.loads(request.body)

        elif content_type.startswith('multipart/form-data'):
            parser = MultiPartParser(request.META,
                                     StringIO(request.body),
                                     request.upload_handlers,
                                     request.encoding)
            data, files = parser.parse()

        elif content_type.startswith('text/html'):
            # TODO: Refactor this into wiki.content ?
            # First pass: Just assume the request body is an HTML fragment.
            html = request.body
            data = dict(content=html)

            # Second pass: Try parsing the body as a fuller HTML document,
            # and scrape out some of the interesting parts.
            try:
                doc = pq(html)
                head_title = doc.find('head title')
                if head_title.length > 0:
                    data['title'] = head_title.text()
                body_content = doc.find('body')
                if body_content.length > 0:
                    data['content'] = to_html(body_content)
            except Exception:
                pass

        else:
            resp = HttpResponse()
            resp.status_code = 400
            resp.content = ugettext(
                "Unsupported content-type: %s") % content_type
            return resp

    except Exception as e:
        resp = HttpResponse()
        resp.status_code = 400
        resp.content = ugettext("Request parsing error: %s") % e
        return resp

    try:
        # Look for existing document to edit:
        doc = Document.objects.get(locale=document_locale,
                                   slug=document_slug)
        section_id = request.GET.get('section', None)
        is_new = False

        # Use ETags to detect mid-air edit collision
        # see: http://www.w3.org/1999/04/Editing/
        if_match = request.META.get('HTTP_IF_MATCH')
        if if_match:
            try:
                expected_etags = parse_etags(if_match)
            except ValueError:
                expected_etags = []
            # Django's parse_etags returns a list of quoted rather than
            # un-quoted ETags starting with version 1.11.
            current_etag = quote_etag(calculate_etag(doc.get_html(section_id)))
            if current_etag not in expected_etags:
                resp = HttpResponse()
                resp.status_code = 412
                resp.content = ugettext('ETag precondition failed')
                return resp

    except Document.DoesNotExist:
        # TODO: There should be a model utility for creating a doc...

        # Let's see if this slug path implies a parent...
        slug_parts = split_slug(document_slug)
        if not slug_parts['parent']:
            # Apparently, this is a root page!
            parent_doc = None
        else:
            # There's a parent implied, so make sure we can find it.
            parent_doc = get_object_or_404(Document,
                                           locale=document_locale,
                                           slug=slug_parts['parent'])

        # Create and save the new document; we'll revise it immediately.
        doc = Document(slug=document_slug, locale=document_locale,
                       title=data.get('title', document_slug),
                       parent_topic=parent_doc)
        doc.save()
        section_id = None  # No section editing for new document!
        is_new = True

    new_rev = doc.revise(request.user, data, section_id)
    doc.schedule_rendering('max-age=0')

    request.authkey.log('created' if is_new else 'updated',
                        new_rev, data.get('summary', None))

    resp = HttpResponse()
    if is_new:
        resp['Location'] = request.build_absolute_uri(doc.get_absolute_url())
        resp.status_code = 201
    else:
        resp.status_code = 205

    return resp
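# A hypothetical client-side sketch of the ETag handshake that
# _document_api_PUT() implements. The URL layout, auth, and helper name here
# are assumptions for illustration, not the real Kuma API client:
import requests

def put_document_html(base_url, locale, slug, html, etag=None):
    """PUT new HTML for a document; a 412 response means a mid-air edit."""
    headers = {'Content-Type': 'text/html'}
    if etag:
        # Send the ETag observed on a previous GET; the server recomputes the
        # current ETag and answers 412 Precondition Failed on a mismatch.
        headers['If-Match'] = etag
    resp = requests.put('%s/%s/docs/%s' % (base_url, locale, slug),
                        data=html, headers=headers)
    # 201: document created; 205: existing document revised; 412: collision
    return resp.status_code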
def get_seo_description(content, locale=None, strip_markup=True):
    # Create an SEO summary
    # TODO: Google only takes the first 180 characters, so maybe we should
    # find the end of the last complete sentence before 180?
    seo_summary = ''
    if content:
        # Try constraining the search for a summary to an explicit
        # "Summary" section, if any.
        # This is ~20x slower than the PyQuery analysis below. Both
        # `parse()` and `.serialize()` are slow and expensive, which is why
        # we're careful to avoid them when we can.
        if 'Summary' in content:
            summary_section = (parse(content).extractSection('Summary')
                               .serialize())
            if summary_section:
                content = summary_section

        # Need to add a <br /> to the page content, otherwise PyQuery won't
        # find a <p></p> element if it's the only element in the doc_html.
        # Note: PyQuery will attempt a download if its first and only
        # argument is a string that starts with 'http://' or 'https://'.
        # Prepending a space, no matter what the content is, defeats that
        # check.
        seo_analyze_doc_html = ' ' + content + '<br />'
        page = pq(seo_analyze_doc_html)

        # Look for the SEO summary class first
        summaryClasses = page.find('.seoSummary')
        if len(summaryClasses):
            if strip_markup:
                seo_summary = summaryClasses.text()
            else:
                seo_summary = ''.join(
                    to_html(item) for item in summaryClasses.items())
        else:
            paragraphs = page.find('p')
            if paragraphs.length:
                for p in range(len(paragraphs)):
                    item = paragraphs.eq(p)
                    if strip_markup:
                        text = item.text()
                    else:
                        text = to_html(item)
                    # Check for a parent length of 2 because we don't want
                    # <p>s wrapped in a <div> (e.g. "<div class='warning'>");
                    # PyQuery adds an "<html><div>" wrapper around the
                    # entire document.
                    text_match = (
                        text and len(text) and
                        'Redirect' not in text and
                        text.find(u'«') == -1 and
                        text.find('&laquo') == -1 and
                        item.parents().length == 2)
                    if text_match:
                        seo_summary = text.strip()
                        break

    if strip_markup:
        # Post-match cleanup: remove markup characters
        seo_summary = seo_summary.replace('<', '').replace('>', '')
        # Remove spaces around some punctuation added by PyQuery
        if locale == 'en-US':
            seo_summary = re.sub(r' ([,\)\.])', r'\1', seo_summary)
            seo_summary = re.sub(r'(\() ', r'\1', seo_summary)

    return seo_summary
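# An illustrative sketch of the hazard the leading-space workaround above
# guards against (behavior as described in the comment; not from the
# original source):
#
#   pq('http://example.com')         # looks like a URL: PyQuery fetches it
#   pq(' http://example.com<br />')  # leading space: parsed as HTML instead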
def get_seo_description(content, locale=None, strip_markup=True):
    # Create an SEO summary
    # TODO: Google only takes the first 180 characters, so maybe we should
    # find the end of the last complete sentence before 180?
    seo_summary = ""
    if content:
        # Try constraining the search for a summary to an explicit "Summary"
        # section, if any.
        # This is ~20x slower than the PyQuery analysis below. Both `parse()`
        # and `.serialize()` are slow and expensive, which is why we're
        # careful to avoid them when we can.
        if "Summary" in content:
            summary_section = parse(content).extractSection("Summary").serialize()
            if summary_section:
                content = summary_section

        # Need to add a <br /> to the page content, otherwise PyQuery won't
        # find a <p></p> element if it's the only element in the doc_html.
        seo_analyze_doc_html = content + "<br />"
        page = pq(seo_analyze_doc_html)

        # Look for the SEO summary class first
        summaryClasses = page.find(".seoSummary")
        if len(summaryClasses):
            if strip_markup:
                seo_summary = summaryClasses.text()
            else:
                seo_summary = "".join(
                    to_html(item) or "" for item in summaryClasses.items()
                )
        else:
            paragraphs = page.find("p")
            if paragraphs.length:
                for p in range(len(paragraphs)):
                    item = paragraphs.eq(p)
                    if strip_markup:
                        text = item.text()
                    else:
                        text = to_html(item)
                    # Check for a parent length of 2 because we don't want
                    # <p>s wrapped in a <div> (e.g. "<div class='warning'>");
                    # PyQuery adds an "<html><div>" wrapper around the
                    # entire document.
                    text_match = (
                        text
                        and len(text)
                        and "Redirect" not in text
                        and text.find("«") == -1
                        and text.find("&laquo") == -1
                        and item.parents().length == 2
                    )
                    if text_match:
                        seo_summary = text.strip()
                        break

    if strip_markup:
        # Post-match cleanup: remove markup characters
        seo_summary = seo_summary.replace("<", "").replace(">", "")
        # Remove spaces around some punctuation added by PyQuery
        if locale == "en-US":
            seo_summary = re.sub(r" ([,\)\.])", r"\1", seo_summary)
            seo_summary = re.sub(r"(\() ", r"\1", seo_summary)

    return seo_summary
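# A minimal sketch of the first-paragraph fallback path in the version above
# (illustrative, not from the original source; relies on the "<html><div>"
# wrapping behavior the code's own comments describe):
def test_get_seo_description_falls_back_to_first_paragraph():
    html = "<p>First paragraph wins.</p><p>Second paragraph.</p>"
    assert get_seo_description(html) == "First paragraph wins."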