def code_sample(self, name):
    """
    Extract a dict containing the html, css, and js listings for a given
    code sample identified by a name.

    This should be pretty agnostic to markup patterns, since it just
    requires a parent container with an ID and 3 child elements somewhere
    within with class names "html", "css", and "js" - and our syntax
    highlighting already does that with <pre>'s

    Given the name of a code sample, attempt to extract it from rendered
    HTML, with a fallback to non-rendered in case of errors.
    """
    parts = ("html", "css", "js")
    data = dict((x, None) for x in parts)

    try:
        src, errors = self.document.get_rendered()
        if errors:
            src = self.document.html
    except DocumentRenderedContentNotAvailable:
        src = self.document.html

    if not src:
        return data

    section = parse(src).extractSection(name).serialize()
    if section:
        # HACK: Ensure the extracted section has a container, in case it
        # consists of a single element.
        sample = pq("<section>%s</section>" % section)
    else:
        # If no section, fall back to plain old ID lookup
        try:
            sample = pq(src).find("[id=%s]" % quoteattr(name))
        except ValueError:
            return data

    selector_templates = (
        ".%s",
        # HACK: syntaxhighlighter (ab)uses the className as a
        # semicolon-separated options list...
        'pre[class*="brush:%s"]',
        'pre[class*="%s;"]',
    )
    for part in parts:
        selector = ",".join(
            selector_template % part for selector_template in selector_templates
        )
        src = sample.find(selector).text(squash_space=False)
        if src is not None:
            # Bug 819999: &nbsp; gets decoded to \xa0, which trips up CSS
            src = src.replace("\xa0", " ")
            # Bug 1284781: &nbsp; is incorrectly parsed on embed sample
            src = src.replace("&nbsp;", " ")
        if src:
            data[part] = src

    return data
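# A minimal, hypothetical sketch of the markup code_sample() is written to
# match: a container with an ID, plus syntaxhighlighter <pre> blocks whose
# "brush" classes mark the html/css/js parts (the sample name and contents
# here are invented):
#
#   <div id="Basic_example">
#     <pre class="brush: html">&lt;p&gt;Hello&lt;/p&gt;</pre>
#     <pre class="brush: css">p { color: red; }</pre>
#     <pre class="brush: js">console.log("hello");</pre>
#   </div>
#
# For that input, code_sample("Basic_example") should return roughly:
#   {"html": "<p>Hello</p>", "css": "p { color: red; }",
#    "js": 'console.log("hello");'}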
def test_user_edit_with_subscription_info(mock1, mock2, test_user):
    """The user has already signed up for a subscription and now the user
    edit page contains information about that from Stripe."""
    mock1.side_effect = mock_get_stripe_customer
    mock2.side_effect = mock_get_stripe_subscription_info
    # We need to fake the User.subscriber_number because
    # 'get_stripe_subscription_info' is mocked, so the signals that would
    # normally set it never fire in this context.
    UserSubscription.set_active(test_user, "sub_123456789")
    # sanity check
    test_user.refresh_from_db()
    assert test_user.subscriber_number == 1

    client = Client()
    client.force_login(test_user)
    response = client.post(
        reverse("users.user_edit", args=[test_user.username]),
        HTTP_HOST=settings.WIKI_HOST,
    )
    assert response.status_code == 200
    page = pq(response.content)
    assert page("#subscription h2").text() == "You are MDN member number 1"
    assert not page(".stripe-error").size()
    assert "MagicCard ending in 4242" in page(".card-info p").text()
def test_next_subscriber_number_shown_for_non_subscribers(test_user):
    client = Client()
    client.force_login(test_user)
    response = client.get(reverse("users.user_edit", args=[test_user.username]))
    assert response.status_code == 200
    page = pq(response.content)
    assert "You will be MDN member number 1" in page("#subscription p").text()
def filter_out_noinclude(src):
    """
    Quick and dirty filter to remove <div class="noinclude"> blocks
    """
    # NOTE: This started as an html5lib filter, but it started getting really
    # complex. Seems like pyquery works well enough without corrupting
    # character encoding.
    if not src:
        return ''
    doc = pq(src)
    doc.remove('*[class=noinclude]')
    return to_html(doc)
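# A quick sanity check for filter_out_noinclude (a sketch: assumes pyquery
# and the local to_html helper are importable in this context):
noinclude_src = '<p>keep</p><div class="noinclude"><p>drop</p></div>'
assert 'drop' not in filter_out_noinclude(noinclude_src)
assert filter_out_noinclude('') == ''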
def css_classnames(self):
    """
    Extract the unique set of class names used in the content
    """
    if not self.document.rendered_html:
        # No point parsing it because we won't find anything!
        return []
    classnames = set()
    for element in pq(self.document.rendered_html).find('*'):
        css_classes = element.attrib.get('class')
        if css_classes:
            classnames.update(css_classes.split(' '))
    return list(classnames)
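# The same pyquery technique as css_classnames(), restated as a standalone
# sketch so it can be tried without a Document object (the function name is
# hypothetical; note the outer wrapper, since find('*') only walks
# descendants of the parsed root):
def classnames_in(html):
    names = set()
    for element in pq(html).find('*'):
        css_classes = element.attrib.get('class')
        if css_classes:
            names.update(css_classes.split(' '))
    return names

# classnames_in('<div><p class="a b"><span class="b c"></span></p></div>')
# -> {'a', 'b', 'c'}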
def extract_data(self, html):
    """Extract user data from profile HTML."""
    data = {}
    parsed = pq(html)
    username_elem = parsed.find("h1.user-title span.nickname")[0]
    data['username'] = username_elem.text
    fullname_elems = parsed.find("h1.user-title span.fn")
    if fullname_elems:
        data['fullname'] = fullname_elems[0].text
    if parsed.find('ul.user-info'):
        for cls, name in (
                ('title', 'title'),
                ('org', 'organization'),
                ('loc', 'location'),
                ('irc', 'irc_nickname')):
            elem = parsed.find('ul.user-info li.%s' % cls)
            if elem:
                if cls == 'irc':
                    raw = elem[0].text
                    data[name] = raw.replace('IRC: ', '')
                else:
                    data[name] = elem[0].text
    tags_divs = parsed.find("div.user-tags")
    for tag_div in tags_divs:
        h2 = tag_div.find('h2')
        if 'Interests' in h2.text:
            tag_type = 'interest'
        else:
            assert 'Expertise' in h2.text
            tag_type = 'expertise'
        tags = sorted([tag.text for tag in tag_div.cssselect('span')])
        data[tag_type] = tags
    if self.social:
        socials = ('twitter', 'github', 'stackoverflow', 'linkedin',
                   'mozillians', 'facebook')
        for social in socials:
            cssselect = 'ul.user-links li.%s a' % social
            social_elem = parsed.find(cssselect)
            if social_elem:
                social_href = self.decode_href(social_elem.attr('href'))
                data['%s_url' % social] = social_href
    since_elem = parsed.find('div.user-since time')
    raw_date_joined = since_elem.attr('datetime')
    date_joined = dateutil.parser.parse(raw_date_joined)
    data['date_joined'] = date_joined.replace(tzinfo=None)
    return data
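# A rough, hypothetical sketch of the profile markup extract_data() parses
# (the real pages carried more chrome around these elements):
#
#   <h1 class="user-title"><span class="nickname">jdoe</span>
#       <span class="fn">Jane Doe</span></h1>
#   <ul class="user-info">
#     <li class="irc">IRC: jdoe</li>
#   </ul>
#   <div class="user-tags"><h2>Interests</h2><span>css</span></div>
#   <div class="user-since"><time datetime="2012-03-04T05:06:07">...</time></div>
#
# which would yield username="jdoe", fullname="Jane Doe",
# irc_nickname="jdoe", interest=['css'], and a naive date_joined datetime.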
def include_svg(path, title=None, title_id=None):
    """
    Embed an SVG file by path, optionally changing the title and adding an id
    """
    svg = loader.get_template(path).render()
    if title:
        svg_parsed = pq(svg, namespaces={"svg": "http://www.w3.org/2000/svg"})
        svg_parsed("svg|title")[0].text = title
        if title_id:
            svg_parsed("svg|title").attr["id"] = title_id
        svg_out = svg_parsed.outerHtml()
    else:
        svg_out = svg
    return jinja2.Markup(svg_out)
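# Hedged usage sketch from a Jinja2 template, assuming include_svg is
# registered as a template helper (the path and strings here are invented):
#
#   {{ include_svg('includes/icons/logo.svg', title='MDN logo',
#                  title_id='logo-title') }}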
def load_prereqs(self, requester, storage):
    """Request the page and gather document links."""
    response = requester.request(self.path)
    parsed = pq(response.content)
    options = self.current_options()
    requirements = []
    seen_paths = set()
    for link in parsed("a"):
        doc_path = self.doc_path_for_href(link.attrib.get("href", ""))
        if doc_path and doc_path not in seen_paths:
            seen_paths.add(doc_path)
            requirements.append(("document", doc_path, options))
    return True, requirements
def get_content_sections(src=""): """ Gets sections in a document """ sections = [] if src: attr = "[id]" selector = (attr + ",").join(SECTION_TAGS) + attr try: document = pq(src) except etree.ParserError: pass else: for element in document.find(selector): sections.append({"title": element.text, "id": element.attrib.get("id")}) return sections
def test_create_stripe_subscription(mock1, mock2, test_user):
    client = Client()
    client.force_login(test_user)
    response = client.post(
        reverse("users.create_stripe_subscription"),
        data={"stripe_token": "tok_visa", "stripe_email": "*****@*****.**"},
        follow=True,
        HTTP_HOST=settings.WIKI_HOST,
    )
    assert response.status_code == 200
    page = pq(response.content)
    assert page(".stripe-error").size() == 0
    assert "MagicCard ending in 4242" in page(".card-info p").text()
def selector_content_find(document, selector):
    """
    Provided a selector, returns the relevant content from the document
    """
    content = ""
    try:
        page = pq(document.rendered_html)
    except ValueError:
        # Errors during construction: return early so `page` is never
        # referenced while unbound.
        return content
    try:
        content = page.find(selector).text()
    except SelectorSyntaxError:
        # pass errors during find/select
        pass
    return content
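# Usage sketch: `document` is anything with a .rendered_html attribute; the
# class below is a stand-in for the real model, and the second assert
# assumes cssselect raises SelectorSyntaxError for the malformed selector.
class FakeDocument:
    rendered_html = '<div><section id="intro"><p>Hello</p></section></div>'

assert selector_content_find(FakeDocument(), '#intro') == 'Hello'
assert selector_content_find(FakeDocument(), 'p:bad(') == ''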
def extract_data(self, html):
    """Extract user data from profile HTML."""
    data = {}
    parsed = pq(html)
    username_elem = parsed.find("h1.user-title span.nickname")[0]
    data["username"] = username_elem.text
    fullname_elems = parsed.find("h1.user-title span.fn")
    if fullname_elems:
        data["fullname"] = fullname_elems[0].text
    if parsed.find("ul.user-info"):
        for cls, name in (
            ("title", "title"),
            ("org", "organization"),
            ("loc", "location"),
            ("irc", "irc_nickname"),
        ):
            elem = parsed.find("ul.user-info li.%s" % cls)
            if elem:
                if cls == "irc":
                    raw = elem[0].text
                    data[name] = raw.replace("IRC: ", "")
                else:
                    data[name] = elem[0].text
    if self.social:
        socials = (
            "twitter",
            "github",
            "stackoverflow",
            "linkedin",
            "pmo",
            "facebook",
        )
        for social in socials:
            cssselect = "ul.user-links li.%s a" % social
            social_elem = parsed.find(cssselect)
            if social_elem:
                social_href = self.decode_href(social_elem.attr("href"))
                data["%s_url" % social] = social_href
    since_elem = parsed.find("div.user-since time")
    raw_date_joined = since_elem.attr("datetime")
    date_joined = dateutil.parser.parse(raw_date_joined)
    data["date_joined"] = date_joined.replace(tzinfo=None)
    return data
def extract_data(self, content):
    """Convert a history pageview into history data."""
    revs = []
    parsed = pq(content)
    # If translation, there may be an entry for the English source
    en_source = parsed.find(
        'li.revision-list-en-source div.revision-list-date a')
    if en_source:
        en_href = self.decode_href(en_source[0].attrib['href'])
    else:
        en_href = None
    for link in parsed.find('div.revision-list-date a'):
        href = self.decode_href(link.attrib['href'])
        if href == en_href:
            revs[-1][-1]['based_on'] = en_href
        else:
            revs.append(('revision', href, {}))
    return revs
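# Hypothetical sketch of the history markup this walks: each revision row
# holds its link in div.revision-list-date, and a translation's English
# source row additionally carries the revision-list-en-source class (paths
# and order are invented):
#
#   <li><div class="revision-list-date">
#     <a href="/fr/docs/X$revision/2">...</a></div></li>
#   <li class="revision-list-en-source"><div class="revision-list-date">
#     <a href="/en-US/docs/X$revision/1">...</a></div></li>
#
# yielding [('revision', '/fr/docs/X$revision/2',
#            {'based_on': '/en-US/docs/X$revision/1'})].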
def extract_data(self, html):
    """Extract revision source and metadata from HTML."""
    data = {}
    keys = ('slug', 'title', 'id', 'created', 'creator', 'is_current',
            'comment')
    parsed = pq(html)

    # Parse revision-info list
    for key in keys:
        name = key.replace('_', '-')
        span = parsed('span[data-name="%s"]' % name)
        if key == 'id':
            value = int(span.text())
        elif key == 'created':
            created = span[0].cssselect('time')[0].attrib['datetime']
            value = dateutil.parser.parse(created)
            value = value.replace(tzinfo=None)
        elif key == 'is_current':
            value = span.attr['data-value'] == '1'
        elif key == 'comment':
            value = span.text() or ''
        else:
            value = span.text()
        data[key] = value

    # Parse tags
    tags = []
    tag_links = parsed.find('ul.tags li a')
    for tag_link in tag_links:
        tags.append(tag_link.text)
    data['tags'] = tags

    # Revision content
    source_elem = parsed.find('div#doc-source pre')[0]
    data['content'] = source_elem.text
    return data
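# Hedged sketch of the metadata spans this expects (values invented; the
# real pages also carried 'slug', 'title', 'creator', and 'comment' spans):
#
#   <span data-name="id">12345</span>
#   <span data-name="created"><time datetime="2019-01-02T03:04:05">...</time></span>
#   <span data-name="is-current" data-value="1"></span>
#   <ul class="tags"><li><a>CSS</a></li></ul>
#   <div id="doc-source"><pre>{{page source}}</pre></div>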
def get_seo_description(content, locale=None, strip_markup=True):
    # Create an SEO summary
    # TODO: Google only takes the first 180 characters, so maybe we find a
    # logical way to find the end of a sentence before 180?
    seo_summary = ''
    if content:
        # Try constraining the search for the summary to an explicit
        # "Summary" section, if any. The parse()/extractSection()/serialize()
        # call below is ~20x slower than the PyQuery analysis - both
        # `parse()` and `.serialize()` are slow and expensive - so we're
        # careful to run it only when a "Summary" marker is present.
        if 'Summary' in content:
            summary_section = (
                parse(content).extractSection('Summary').serialize())
            if summary_section:
                content = summary_section

        # Need to add a BR to the page content, otherwise PyQuery won't find
        # a <p></p> element if it's the only element in the doc_html.
        seo_analyze_doc_html = content + '<br />'
        page = pq(seo_analyze_doc_html)

        # Look for the SEO summary class first
        summaryClasses = page.find('.seoSummary')
        if len(summaryClasses):
            if strip_markup:
                seo_summary = summaryClasses.text()
            else:
                seo_summary = ''.join(
                    to_html(item) for item in summaryClasses.items())
        else:
            paragraphs = page.find('p')
            if paragraphs.length:
                for p in range(len(paragraphs)):
                    item = paragraphs.eq(p)
                    if strip_markup:
                        text = item.text()
                    else:
                        text = to_html(item)
                    # Checking for a parent length of 2 because we don't want
                    # p's wrapped in DIVs ("<div class='warning'>"), and
                    # PyQuery adds "<html><div>" wrapping to the entire
                    # document.
                    text_match = (
                        text and len(text) and
                        'Redirect' not in text and
                        text.find(u'«') == -1 and
                        text.find('&laquo') == -1 and
                        item.parents().length == 2)
                    if text_match:
                        seo_summary = text.strip()
                        break

    if strip_markup:
        # Post-found cleanup: remove markup chars
        seo_summary = seo_summary.replace('<', '').replace('>', '')
        # Remove spaces around some punctuation added by PyQuery
        if locale == 'en-US':
            seo_summary = re.sub(r' ([,\)\.])', r'\1', seo_summary)
            seo_summary = re.sub(r'(\() ', r'\1', seo_summary)

    return seo_summary
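# Hedged examples of the precedence above, assuming the document wrapping
# described in the parents-length comment: an explicit .seoSummary span wins,
# otherwise the first acceptable top-level <p> is used.
#
#   get_seo_description(
#       '<p><span class="seoSummary">Short summary.</span> More prose.</p>')
#   -> 'Short summary.'
#
#   get_seo_description('<div class="warning"><p>skipped</p></div><p>Used.</p>')
#   -> 'Used.'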
def is_banned(self, html):
    """Detect if a 404 is for a banned user."""
    parsed = pq(html)
    ban_text = parsed.find('p.notice')
    return 'banned' in ban_text.text()