Example #1
    def code_sample(self, name):
        """
        Extract a dict containing the html, css, and js listings for a given
        code sample identified by a name.

        This should be pretty agnostic to markup patterns, since it just
        requires a parent container with an ID and 3 child elements somewhere
        within it with class names "html", "css", and "js" - and our syntax
        highlighting already does that with <pre>'s

        Given the name of a code sample, attempt to extract it from rendered
        HTML with a fallback to non-rendered in case of errors.
        """
        parts = ("html", "css", "js")
        data = dict((x, None) for x in parts)

        try:
            src, errors = self.document.get_rendered()
            if errors:
                src = self.document.html
        except DocumentRenderedContentNotAvailable:
            src = self.document.html

        if not src:
            return data

        section = parse(src).extractSection(name).serialize()
        if section:
            # HACK: Ensure the extracted section has a container, in case it
            # consists of a single element.
            sample = pq("<section>%s</section>" % section)
        else:
            # If no section, fall back to plain old ID lookup
            try:
                sample = pq(src).find("[id=%s]" % quoteattr(name))
            except ValueError:
                return data

        selector_templates = (
            ".%s",
            # HACK: syntaxhighlighter (ab)uses the className as a
            # semicolon-separated options list...
            'pre[class*="brush:%s"]',
            'pre[class*="%s;"]',
        )
        for part in parts:
            selector = ",".join(
                selector_template % part for selector_template in selector_templates
            )
            src = sample.find(selector).text(squash_space=False)
            if src is not None:
                # Bug 819999: &nbsp; gets decoded to \xa0, which trips up CSS
                src = src.replace("\xa0", " ")
                # Bug 1284781: &nbsp; is incorrectly parsed on embed sample
                src = src.replace("&nbsp;", " ")
            if src:
                data[part] = src

        return data
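As a rough, self-contained sketch of the selector logic above (the markup and sample name here are invented, not taken from MDN), this is roughly how pyquery pulls the three listings out of a syntaxhighlighter-style container:

from pyquery import PyQuery as pq

# Hypothetical markup: a container with an ID and three <pre> children
# whose class attributes carry syntaxhighlighter "brush:" options.
SRC = """
<section id="Example_sample">
  <pre class="brush:html">&lt;p&gt;Hello&lt;/p&gt;</pre>
  <pre class="brush:css">p { color: red; }</pre>
  <pre class="brush:js">console.log("hi");</pre>
</section>
"""

sample = pq(SRC)
data = {}
for part in ("html", "css", "js"):
    # Same selector templates as code_sample() above, joined with commas
    selector = ",".join(
        template % part
        for template in (".%s", 'pre[class*="brush:%s"]', 'pre[class*="%s;"]')
    )
    data[part] = sample.find(selector).text()

print(data["css"])  # p { color: red; }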
Example #2
def test_user_edit_with_subscription_info(mock1, mock2, test_user):
    """The user has already signed up for a subscription and now the user edit
    page contains information about that from Stripe."""
    mock1.side_effect = mock_get_stripe_customer
    mock2.side_effect = mock_get_stripe_subscription_info

    # We need to fake User.subscriber_number because
    # 'get_stripe_subscription_info' is mocked, so the signals that would
    # normally set it never fire in this context.
    UserSubscription.set_active(test_user, "sub_123456789")
    # sanity check
    test_user.refresh_from_db()
    assert test_user.subscriber_number == 1

    client = Client()
    client.force_login(test_user)
    response = client.post(
        reverse("users.user_edit", args=[test_user.username]),
        HTTP_HOST=settings.WIKI_HOST,
    )
    assert response.status_code == 200
    page = pq(response.content)
    assert page("#subscription h2").text() == "You are MDN member number 1"
    assert not page(".stripe-error").size()
    assert "MagicCard ending in 4242" in page(".card-info p").text()
Example #3
def test_next_subscriber_number_shown_for_non_subscribers(test_user):
    client = Client()
    client.force_login(test_user)
    response = client.get(reverse("users.user_edit",
                                  args=[test_user.username]))
    assert response.status_code == 200
    page = pq(response.content)
    assert "You will be MDN member number 1" in page("#subscription p").text()
Example #4
def filter_out_noinclude(src):
    """
    Quick and dirty filter to remove <div class="noinclude"> blocks
    """
    # NOTE: This started as an html5lib filter, but it started getting really
    # complex. Seems like pyquery works well enough without corrupting
    # character encoding.
    if not src:
        return ''
    doc = pq(src)
    doc.remove('*[class=noinclude]')
    return to_html(doc)
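A minimal sketch of the same pyquery removal, assuming made-up markup (and using .outerHtml() instead of the project's to_html() helper):

from pyquery import PyQuery as pq

src = '<div><div class="noinclude">editor-only note</div><p>Kept content</p></div>'
doc = pq(src)
doc.remove('*[class=noinclude]')
# The noinclude block is gone; only the paragraph remains inside the wrapper.
print(doc.outerHtml())  # <div><p>Kept content</p></div>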
Example #5
    def css_classnames(self):
        """
        Extract the unique set of class names used in the content
        """
        if not self.document.rendered_html:
            # No point parsing it because we won't find anything!
            return []
        classnames = set()
        for element in pq(self.document.rendered_html).find('*'):
            css_classes = element.attrib.get('class')
            if css_classes:
                classnames.update(css_classes.split(' '))
        return list(classnames)
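For illustration, the same scan run against a small, made-up fragment:

from pyquery import PyQuery as pq

html = '<section><div class="note example">x</div><span class="note">y</span></section>'
classnames = set()
for element in pq(html).find('*'):
    css_classes = element.attrib.get('class')
    if css_classes:
        classnames.update(css_classes.split(' '))
print(sorted(classnames))  # ['example', 'note']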
Example #6
    def extract_data(self, html):
        """Extract user data from profile HTML."""
        data = {}
        parsed = pq(html)
        username_elem = parsed.find("h1.user-title span.nickname")[0]
        data['username'] = username_elem.text
        fullname_elems = parsed.find("h1.user-title span.fn")
        if fullname_elems:
            data['fullname'] = fullname_elems[0].text

        if parsed.find('ul.user-info'):
            for cls, name in (
                    ('title', 'title'),
                    ('org', 'organization'),
                    ('loc', 'location'),
                    ('irc', 'irc_nickname')):
                elem = parsed.find('ul.user-info li.%s' % cls)
                if elem:
                    if cls == 'irc':
                        raw = elem[0].text
                        data[name] = raw.replace('IRC: ', '')
                    else:
                        data[name] = elem[0].text

        tags_divs = parsed.find("div.user-tags")
        for tag_div in tags_divs:
            h2 = tag_div.find('h2')
            if 'Interests' in h2.text:
                tag_type = 'interest'
            else:
                assert 'Expertise' in h2.text
                tag_type = 'expertise'
            tags = sorted([tag.text for tag in tag_div.cssselect('span')])
            data[tag_type] = tags

        if self.social:
            socials = ('twitter', 'github', 'stackoverflow', 'linkedin',
                       'mozillians', 'facebook')
            for social in socials:
                cssselect = 'ul.user-links li.%s a' % social
                social_elem = parsed.find(cssselect)
                if social_elem:
                    social_href = self.decode_href(social_elem.attr('href'))
                    data['%s_url' % social] = social_href

        since_elem = parsed.find('div.user-since time')
        raw_date_joined = since_elem.attr('datetime')
        date_joined = dateutil.parser.parse(raw_date_joined)
        data['date_joined'] = date_joined.replace(tzinfo=None)

        return data
Example #7
def include_svg(path, title=None, title_id=None):
    """
    Embed an SVG file by path, optionally changing the title,
    and adding an id
    """
    svg = loader.get_template(path).render()
    if title:
        svg_parsed = pq(svg, namespaces={"svg": "http://www.w3.org/2000/svg"})
        svg_parsed("svg|title")[0].text = title
        if title_id:
            svg_parsed("svg|title").attr["id"] = title_id
        svg_out = svg_parsed.outerHtml()
    else:
        svg_out = svg
    return jinja2.Markup(svg_out)
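A minimal sketch of the pyquery namespace handling used above, applied to an inline SVG string rather than a Django template (forcing the XML parser so the SVG namespace is preserved is an assumption here, not part of the original helper):

from pyquery import PyQuery as pq

svg = (
    '<svg xmlns="http://www.w3.org/2000/svg">'
    '<title>Old title</title><rect width="10" height="10"/></svg>'
)
# parser="xml" keeps <title> in the SVG namespace so "svg|title" matches.
doc = pq(svg, parser="xml", namespaces={"svg": "http://www.w3.org/2000/svg"})
doc("svg|title")[0].text = "New title"
doc("svg|title").attr["id"] = "icon-title"
print(doc("svg|title").text())  # New title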
Example #8
    def load_prereqs(self, requester, storage):
        """Request the page and gather document links."""
        response = requester.request(self.path)
        parsed = pq(response.content)
        options = self.current_options()
        requirements = []
        seen_paths = set()

        for link in parsed("a"):
            doc_path = self.doc_path_for_href(link.attrib.get("href", ""))
            if doc_path and doc_path not in seen_paths:
                seen_paths.add(doc_path)
                requirements.append(("document", doc_path, options))

        return True, requirements
Example #9
def test_user_edit_with_subscription_info(mock1, mock2, test_user):
    """The user has already signed up for a subscription and now the user edit
    page contains information about that from Stripe."""
    mock1.side_effect = mock_get_stripe_customer
    mock2.side_effect = mock_get_stripe_subscription_info
    client = Client()
    client.force_login(test_user)
    response = client.post(
        reverse("users.user_edit", args=[test_user.username]),
        HTTP_HOST=settings.WIKI_HOST,
    )
    assert response.status_code == 200
    page = pq(response.content)
    assert not page(".stripe-error").size()
    assert "MagicCard ending in 4242" in page(".card-info p").text()
Example #10
def get_content_sections(src=""):
    """
    Gets sections in a document
    """
    sections = []
    if src:
        attr = "[id]"
        selector = (attr + ",").join(SECTION_TAGS) + attr
        try:
            document = pq(src)
        except etree.ParserError:
            pass
        else:
            for element in document.find(selector):
                sections.append({"title": element.text, "id": element.attrib.get("id")})
    return sections
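The selector this builds is just a comma-separated list of id-qualified section tags; a tiny sketch, assuming an illustrative SECTION_TAGS tuple (the real constant is defined elsewhere in the module):

SECTION_TAGS = ("h2", "h3", "section")  # illustrative subset, not the real value
attr = "[id]"
selector = (attr + ",").join(SECTION_TAGS) + attr
print(selector)  # h2[id],h3[id],section[id]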
Example #11
def test_create_stripe_subscription(mock1, mock2, test_user):
    client = Client()
    client.force_login(test_user)

    response = client.post(
        reverse("users.create_stripe_subscription"),
        data={"stripe_token": "tok_visa", "stripe_email": "*****@*****.**"},
        follow=True,
        HTTP_HOST=settings.WIKI_HOST,
    )

    assert response.status_code == 200

    page = pq(response.content)
    assert page(".stripe-error").size() == 0
    assert "MagicCard ending in 4242" in page(".card-info p").text()
Example #12
def selector_content_find(document, selector):
    """
    Given a selector, return the relevant content from the document
    """
    content = ""
    try:
        page = pq(document.rendered_html)
    except ValueError:
        # Bail out on errors during construction; `page` was never bound
        return content
    try:
        content = page.find(selector).text()
    except SelectorSyntaxError:
        # pass errors during find/select
        pass
    return content
Example #13
    def extract_data(self, html):
        """Extract user data from profile HTML."""
        data = {}
        parsed = pq(html)
        username_elem = parsed.find("h1.user-title span.nickname")[0]
        data["username"] = username_elem.text
        fullname_elems = parsed.find("h1.user-title span.fn")
        if fullname_elems:
            data["fullname"] = fullname_elems[0].text

        if parsed.find("ul.user-info"):
            for cls, name in (
                ("title", "title"),
                ("org", "organization"),
                ("loc", "location"),
                ("irc", "irc_nickname"),
            ):
                elem = parsed.find("ul.user-info li.%s" % cls)
                if elem:
                    if cls == "irc":
                        raw = elem[0].text
                        data[name] = raw.replace("IRC: ", "")
                    else:
                        data[name] = elem[0].text

        if self.social:
            socials = (
                "twitter",
                "github",
                "stackoverflow",
                "linkedin",
                "pmo",
                "facebook",
            )
            for social in socials:
                cssselect = "ul.user-links li.%s a" % social
                social_elem = parsed.find(cssselect)
                if social_elem:
                    social_href = self.decode_href(social_elem.attr("href"))
                    data["%s_url" % social] = social_href

        since_elem = parsed.find("div.user-since time")
        raw_date_joined = since_elem.attr("datetime")
        date_joined = dateutil.parser.parse(raw_date_joined)
        data["date_joined"] = date_joined.replace(tzinfo=None)

        return data
Example #14
def get_content_sections(src=''):
    """
    Gets sections in a document
    """
    sections = []
    if src:
        attr = '[id]'
        selector = (attr + ',').join(SECTION_TAGS) + attr
        try:
            document = pq(src)
        except etree.ParserError:
            pass
        else:
            for element in document.find(selector):
                sections.append({'title': element.text,
                                 'id': element.attrib.get('id')})
    return sections
Example #15
    def extract_data(self, content):
        """Convert a history pageview into history data."""
        revs = []
        parsed = pq(content)

        # If this is a translation, there may be an entry for the English source
        en_source = parsed.find('li.revision-list-en-source'
                                ' div.revision-list-date a')
        if en_source:
            en_href = self.decode_href(en_source[0].attrib['href'])
        else:
            en_href = None

        for link in parsed.find('div.revision-list-date a'):
            href = self.decode_href(link.attrib['href'])
            if href == en_href:
                revs[-1][-1]['based_on'] = en_href
            else:
                revs.append(('revision', href, {}))
        return revs
Example #16
    def extract_data(self, html):
        """Extract revision source and metadata from HTML."""
        data = {}
        keys = ('slug', 'title', 'id', 'created', 'creator', 'is_current',
                'comment')
        parsed = pq(html)

        # Parse revision-info list
        for key in keys:
            name = key.replace('_', '-')
            span = parsed('span[data-name="%s"]' % name)
            if key == 'id':
                value = int(span.text())
            elif key == 'created':
                created = span[0].cssselect('time')[0].attrib['datetime']
                value = dateutil.parser.parse(created)
                value = value.replace(tzinfo=None)
            elif key == 'is_current':
                value = span.attr['data-value'] == '1'
            elif key == 'comment':
                value = span.text() or ''
            else:
                value = span.text()
            data[key] = value

        # Parse tags
        tags = []
        tag_links = parsed.find('ul.tags li a')
        for tag_link in tag_links:
            tags.append(tag_link.text)
        data['tags'] = tags

        # Revision content
        source_elem = parsed.find('div#doc-source pre')[0]
        data['content'] = source_elem.text

        return data
Example #17
    def extract_data(self, html):
        """Extract revision source and metadata from HTML."""
        data = {}
        keys = ("slug", "title", "id", "created", "creator", "is_current",
                "comment")
        parsed = pq(html)

        # Parse revision-info list
        for key in keys:
            name = key.replace("_", "-")
            span = parsed('span[data-name="%s"]' % name)
            if key == "id":
                value = int(span.text())
            elif key == "created":
                created = span[0].cssselect("time")[0].attrib["datetime"]
                value = dateutil.parser.parse(created)
                value = value.replace(tzinfo=None)
            elif key == "is_current":
                value = span.attr["data-value"] == "1"
            elif key == "comment":
                value = span.text() or ""
            else:
                value = span.text()
            data[key] = value

        # Parse tags
        tags = []
        tag_links = parsed.find("ul.tags li a")
        for tag_link in tag_links:
            tags.append(tag_link.text)
        data["tags"] = tags

        # Revision content
        source_elem = parsed.find("div#doc-source pre")[0]
        data["content"] = source_elem.text

        return data
Example #18
def get_seo_description(content, locale=None, strip_markup=True):
    # Create an SEO summary
    # TODO:  Google only takes the first 180 characters, so maybe we should
    #        find a logical way to end at a sentence boundary before 180?
    seo_summary = ''
    if content:
        # Try constraining the search for summary to an explicit "Summary"
        # section, if any.
        # This line is ~20x times slower than doing the PyQuery analysis.
        # Both `parse()` and `.serialize()` are slow and expensive.
        # That's why we're careful to avoid it if we can.
        if 'Summary' in content:
            summary_section = (
                parse(content).extractSection('Summary').serialize())
            if summary_section:
                content = summary_section

        # Need to add a BR to the page content, otherwise pyQuery won't find
        # a <p></p> element if it's the only element in the doc_html.
        seo_analyze_doc_html = content + '<br />'
        page = pq(seo_analyze_doc_html)

        # Look for the SEO summary class first
        summaryClasses = page.find('.seoSummary')
        if len(summaryClasses):
            if strip_markup:
                seo_summary = summaryClasses.text()
            else:
                seo_summary = ''.join(
                    to_html(item) for item in summaryClasses.items())
        else:
            paragraphs = page.find('p')
            if paragraphs.length:
                for p in range(len(paragraphs)):
                    item = paragraphs.eq(p)
                    if strip_markup:
                        text = item.text()
                    else:
                        text = to_html(item)
                    # Check for a parent length of 2 because we don't want
                    # <p>s wrapped in DIVs ("<div class='warning'>"), and
                    # pyQuery adds an "<html><div>" wrapper around the
                    # entire document.
                    text_match = (text and len(text) and 'Redirect' not in text
                                  and text.find(u'«') == -1
                                  and text.find('&laquo') == -1
                                  and item.parents().length == 2)
                    if text_match:
                        seo_summary = text.strip()
                        break

    if strip_markup:
        # Post-found cleanup
        # remove markup chars
        seo_summary = seo_summary.replace('<', '').replace('>', '')
        # remove spaces around some punctuation added by PyQuery
        if locale == 'en-US':
            seo_summary = re.sub(r' ([,\)\.])', r'\1', seo_summary)
            seo_summary = re.sub(r'(\() ', r'\1', seo_summary)

    return seo_summary
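The locale-specific cleanup at the end is easiest to see on a made-up string; a small sketch of just those two substitutions:

import re

text = "A summary ( with stray spaces ) , as PyQuery sometimes emits ."
text = re.sub(r' ([,\)\.])', r'\1', text)
text = re.sub(r'(\() ', r'\1', text)
print(text)  # A summary (with stray spaces), as PyQuery sometimes emits.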
Example #19
    def is_banned(self, html):
        """Detect if a 404 is for a banned user."""
        parsed = pq(html)
        ban_text = parsed.find('p.notice')
        return 'banned' in ban_text.text()
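A quick, hypothetical call showing the check (the markup is invented):

from pyquery import PyQuery as pq

html = '<main><p class="notice">This user has been banned.</p></main>'
parsed = pq(html)
print('banned' in parsed.find('p.notice').text())  # True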