Example 1
    def __iter__(self):
        from wiki.models import Document

        input = html5lib_Filter.__iter__(self)

        # Pass #1: Gather all the link URLs and prepare annotations
        links = dict()
        buffer = []
        for token in input:
            buffer.append(token)
            if "StartTag" == token["type"] and "a" == token["name"]:
                attrs = dict(token["data"])
                if not "href" in attrs:
                    continue

                href = attrs["href"]
                if href.startswith(self.base_url):
                    # Squash site-absolute URLs to site-relative paths.
                    href = "/%s" % href[len(self.base_url) :]

                # Prepare annotations record for this path.
                links[href] = dict(classes=[])

        # Run through all the links and check for annotatable conditions.
        for href in links.keys():

            # Is this an external URL?
            is_external = False
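            # (EXTERNAL_PREFIXES is assumed to hold scheme prefixes such as
            #  ('http:', 'https:'); any href matching one is styled "external")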
            for prefix in self.EXTERNAL_PREFIXES:
                if href.startswith(prefix):
                    is_external = True
                    break
            if is_external:
                links[href]["classes"].append("external")
                continue

            # TODO: Should this also check for old-school mindtouch URLs? Or
            # should we encourage editors to convert to new-style URLs to take
            # advantage of link annotation? (I'd say the latter)

            # Is this a kuma doc URL?
            if "/docs/" in href:

                # Check if this is a special docs path that's exempt from "new"
                skip = False
                for path in DOC_SPECIAL_PATHS:
                    if "/docs/%s" % path in href:
                        skip = True
                if skip:
                    continue

                href_locale, href_path = href.split(u"/docs/", 1)
                if href_locale.startswith(u"/"):
                    href_locale = href_locale[1:]

                if "#" in href_path:
                    # If present, discard the hash anchor
                    href_path, _, _ = href_path.partition("#")

                # Handle any URL-encoded UTF-8 characters in the path
                href_path = href_path.encode("utf-8", "ignore")
                href_path = urllib.unquote(href_path)
                href_path = href_path.decode("utf-8", "ignore")

                # Try to sort out the locale and slug through some of our
                # redirection logic.
                locale, slug, needs_redirect = Document.locale_and_slug_from_path(href_path, path_locale=href_locale)

                # Does this locale and slug correspond to an existing document?
                # If not, mark it as a "new" link.
                #
                # TODO: Should these DB queries be batched up into one big
                # query? A page with hundreds of links will fire off hundreds
                # of queries
                ct = Document.objects.filter(locale=locale, slug=slug).count()
                if ct == 0:
                    links[href]["classes"].append("new")

        # Pass #2: Filter the content, annotating links
        for token in buffer:
            if "StartTag" == token["type"] and "a" == token["name"]:
                attrs = dict(token["data"])

                if "href" in attrs:

                    href = attrs["href"]
                    if href.startswith(self.base_url):
                        # Squash site-absolute URLs to site-relative paths.
                        href = "/%s" % href[len(self.base_url) :]

                    if href in links:
                        # Update class names on this link element.
                        if "class" in attrs:
                            classes = set(attrs["class"].split(u" "))
                        else:
                            classes = set()
                        classes.update(links[href]["classes"])
                        if classes:
                            attrs["class"] = u" ".join(classes)

                token["data"] = attrs.items()

            yield token
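
The encode/unquote/decode sequence in Pass #1 is the easiest part to misread. Here is a standalone illustration of what it does to a percent-encoded path, assuming the Python 2 urllib these examples use (the sample path is made up):

    import urllib

    href_path = u'Caract\xe8res/%C3%A9'
    # Encode to UTF-8 bytes so urllib.unquote can operate on a byte string.
    raw = href_path.encode('utf-8', 'ignore')   # 'Caract\xc3\xa8res/%C3%A9'
    # Replace each %XX escape with the raw byte it stands for.
    raw = urllib.unquote(raw)                   # 'Caract\xc3\xa8res/\xc3\xa9'
    # Decode back to unicode for the locale/slug lookup.
    href_path = raw.decode('utf-8', 'ignore')   # u'Caract\xe8res/\xe9'
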
Example 2
    def __iter__(self):
        from wiki.models import Document

        input = html5lib_Filter.__iter__(self)

        # Pass #1: Gather all the link URLs and prepare annotations
        links = dict()
        buffer = []
        for token in input:
            buffer.append(token)
            if ('StartTag' == token['type'] and 'a' == token['name']):
                attrs = dict(token['data'])
                if 'href' not in attrs:
                    continue

                href = attrs['href']
                if href.startswith(self.base_url):
                    # Squash site-absolute URLs to site-relative paths.
                    href = '/%s' % href[len(self.base_url):]

                # Prepare annotations record for this path.
                links[href] = dict(classes=[])

        # Run through all the links and check for annotatable conditions.
        for href in links.keys():

            # Is this an external URL?
            is_external = False
            for prefix in self.EXTERNAL_PREFIXES:
                if href.startswith(prefix):
                    is_external = True
                    break
            if is_external:
                links[href]['classes'].append('external')
                continue

            # TODO: Should this also check for old-school mindtouch URLs? Or
            # should we encourage editors to convert to new-style URLs to take
            # advantage of link annotation? (I'd say the latter)

            # Is this a kuma doc URL?
            if '/docs/' in href:

                # Check if this is a special docs path that's exempt from "new"
                skip = False
                for path in DOC_SPECIAL_PATHS:
                    if '/docs/%s' % path in href:
                        skip = True
                if skip:
                    continue

                href_locale, href_path = href.split(u'/docs/', 1)
                if href_locale.startswith(u'/'):
                    href_locale = href_locale[1:]

                if '#' in href_path:
                    # If present, discard the hash anchor
                    href_path, _, _ = href_path.partition('#')

                # Handle any URL-encoded UTF-8 characters in the path
                href_path = href_path.encode('utf-8', 'ignore')
                href_path = urllib.unquote(href_path)
                href_path = href_path.decode('utf-8', 'ignore')

                # Try to sort out the locale and slug through some of our
                # redirection logic.
                locale, slug, needs_redirect = (
                    Document.locale_and_slug_from_path(
                        href_path, path_locale=href_locale))

                # Does this locale and slug correspond to an existing document?
                # If not, mark it as a "new" link.
                #
                # TODO: Should these DB queries be batched up into one big
                # query? A page with hundreds of links will fire off hundreds
                # of queries
                ct = Document.objects.filter(locale=locale, slug=slug).count()
                if ct == 0:
                    links[href]['classes'].append('new')

        # Pass #2: Filter the content, annotating links
        for token in buffer:
            if ('StartTag' == token['type'] and 'a' == token['name']):
                attrs = dict(token['data'])

                if 'href' in attrs:

                    href = attrs['href']
                    if href.startswith(self.base_url):
                        # Squash site-absolute URLs to site-relative paths.
                        href = '/%s' % href[len(self.base_url):]

                    if href in links:
                        # Update class names on this link element.
                        if 'class' in attrs:
                            classes = set(attrs['class'].split(u' '))
                        else:
                            classes = set()
                        classes.update(links[href]['classes'])
                        if classes:
                            attrs['class'] = u' '.join(classes)

                token['data'] = attrs.items()

            yield token
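
Example 2 is the same logic as Example 1, differing only in quoting and wrapping style; the TODO about firing one DB query per link still applies to both. Structurally, both passes follow the usual buffered html5lib filter shape. A minimal sketch of that shape, assuming the older html5lib token format these examples rely on (token['data'] as a list of (name, value) pairs) and that html5lib_Filter is an alias for html5lib's filter base class:

    class LinkGatheringFilter(html5lib_Filter):
        def __iter__(self):
            buffer = []
            hrefs = set()
            # Pass 1: buffer every token and remember <a href> values,
            # because the treewalker's token stream can only be consumed once.
            for token in html5lib_Filter.__iter__(self):
                buffer.append(token)
                if token['type'] == 'StartTag' and token['name'] == 'a':
                    attrs = dict(token['data'])
                    if 'href' in attrs:
                        hrefs.add(attrs['href'])

            # ... compute per-href annotations here ...

            # Pass 2: replay the buffered tokens, rewriting as needed.
            for token in buffer:
                yield token
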
Example 3
    def __iter__(self):
        from wiki.models import Document

        input = html5lib_Filter.__iter__(self)

        # Pass #1: Gather all the link URLs and prepare annotations
        links = dict()
        buffer = []
        for token in input:
            buffer.append(token)
            if ('StartTag' == token['type'] and 'a' == token['name']):
                attrs = dict(token['data'])
                if 'href' not in attrs:
                    continue

                href = attrs['href']
                if href.startswith(self.base_url):
                    # Squash site-absolute URLs to site-relative paths.
                    href = '/%s' % href[len(self.base_url):]

                # Prepare annotations record for this path.
                links[href] = dict(classes=[])

        needs_existence_check = defaultdict(lambda: defaultdict(set))
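        # (assumes "from collections import defaultdict" at module level,
        #  alongside the urllib import used by the decode step)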

        # Run through all the links and check for annotatable conditions.
        for href in links.keys():

            # Is this an external URL?
            is_external = False
            for prefix in self.EXTERNAL_PREFIXES:
                if href.startswith(prefix):
                    is_external = True
                    break
            if is_external:
                links[href]['classes'].append('external')
                continue

            # TODO: Should this also check for old-school mindtouch URLs? Or
            # should we encourage editors to convert to new-style URLs to take
            # advantage of link annotation? (I'd say the latter)

            # Is this a kuma doc URL?
            if '/docs/' in href:

                # Check if this is a special docs path that's exempt from "new"
                skip = False
                for path in DOC_SPECIAL_PATHS:
                    if '/docs/%s' % path in href:
                        skip = True
                if skip:
                    continue

                href_locale, href_path = href.split(u'/docs/', 1)
                if href_locale.startswith(u'/'):
                    href_locale = href_locale[1:]

                if '#' in href_path:
                    # If present, discard the hash anchor
                    href_path, _, _ = href_path.partition('#')

                # Handle any URL-encoded UTF-8 characters in the path
                href_path = href_path.encode('utf-8', 'ignore')
                href_path = urllib.unquote(href_path)
                href_path = href_path.decode('utf-8', 'ignore')

                # Try to sort out the locale and slug through some of our
                # redirection logic.
                locale, slug, needs_redirect = (
                    Document.locale_and_slug_from_path(
                        href_path, path_locale=href_locale))

                # Gather up this link for existence check
                needs_existence_check[locale.lower()][slug.lower()].add(href)

        # Perform existence checks for all the links, using one DB query per
        # locale for all the candidate slugs.
        for locale, slug_hrefs in needs_existence_check.items():

            existing_slugs = (Document.objects.filter(
                locale=locale,
                slug__in=slug_hrefs.keys()).values_list('slug', flat=True))

            # Remove the slugs that pass existence check.
            for slug in existing_slugs:
                lslug = slug.lower()
                if lslug in slug_hrefs:
                    del slug_hrefs[lslug]

            # Mark all the links whose slugs did not come back from the DB
            # query as "new"
            for slug, hrefs in slug_hrefs.items():
                for href in hrefs:
                    links[href]['classes'].append('new')

        # Pass #2: Filter the content, annotating links
        for token in buffer:
            if ('StartTag' == token['type'] and 'a' == token['name']):
                attrs = dict(token['data'])

                if 'href' in attrs:

                    href = attrs['href']
                    if href.startswith(self.base_url):
                        # Squash site-absolute URLs to site-relative paths.
                        href = '/%s' % href[len(self.base_url):]

                    if href in links:
                        # Update class names on this link element.
                        if 'class' in attrs:
                            classes = set(attrs['class'].split(u' '))
                        else:
                            classes = set()
                        classes.update(links[href]['classes'])
                        if classes:
                            attrs['class'] = u' '.join(classes)

                token['data'] = attrs.items()

            yield token
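
Example 3 resolves the batching TODO from the first two examples: instead of one existence query per link, it groups candidate slugs by locale in a nested defaultdict and issues a single slug__in query per locale. A self-contained sketch of that grouping logic, with a plain set of (locale, slug) pairs standing in for the Document table (the paths and rows are made up):

    from collections import defaultdict

    links = {
        '/en-US/docs/HTML': dict(classes=[]),
        '/en-US/docs/NoSuchPage': dict(classes=[]),
    }
    existing = set([('en-us', 'html')])  # stand-in for rows in the DB

    needs_existence_check = defaultdict(lambda: defaultdict(set))
    needs_existence_check['en-us']['html'].add('/en-US/docs/HTML')
    needs_existence_check['en-us']['nosuchpage'].add('/en-US/docs/NoSuchPage')

    for locale, slug_hrefs in needs_existence_check.items():
        # One lookup per locale: which candidate slugs already exist?
        existing_slugs = [s for (l, s) in existing
                          if l == locale and s in slug_hrefs]
        for slug in existing_slugs:
            del slug_hrefs[slug]
        # Anything left matched no document, so its links are "new".
        for slug, hrefs in slug_hrefs.items():
            for href in hrefs:
                links[href]['classes'].append('new')

    assert links['/en-US/docs/NoSuchPage']['classes'] == ['new']
    assert links['/en-US/docs/HTML']['classes'] == []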