Ejemplo n.º 1
0
    def read_tag(self, element, prev_url):
        """
        Recursively collect link targets and readable text from a DOM node.

        Args:
            element (DOM node): node to inspect, e.g. one obtained from a
                DOM tree with getElementsByTagName
            prev_url (str): url of the current page, passed to
                normalize_url together with each href value

        Returns:
            tuple: a ``(urls, text)`` pair whose shape depends on the node:

            * text node -> ``(None, stripped_text)``
            * ``<a>`` with an href -> ``([url], None)`` when normalize_url
              gives a truthy result, otherwise ``([], None)``
            * ``<a>`` without an href -> ``(None, None)``
            * other accepted element with children -> ``(urls, text)``,
              where ``text`` is every non-empty child text prefixed with
              a single space and concatenated
            * anything else (childless element, tag listed in
              UNACCEPTABLE_HTML_TAGS, comment, ...) -> ``(None, None)``
        """
        if element.nodeType == element.TEXT_NODE:
            return (None, element.data.strip())

        # Only element nodes carry a tagName, so the node type is checked
        # first and the tag filter is evaluated only for real elements.
        if (element.nodeType != element.ELEMENT_NODE or
                element.tagName in UNACCEPTABLE_HTML_TAGS):
            return (None, None)

        if element.tagName == "a":
            # NOTE(review): an anchor returns here without visiting its
            # children, so text nested inside <a> is never collected --
            # confirm this is intended.
            if not element.hasAttribute("href"):
                return (None, None)
            norm = normalize_url(element.getAttribute("href"), prev_url)
            return ([norm], None) if norm else ([], None)

        if not element.hasChildNodes():
            return (None, None)

        urls = []
        text_parts = []
        for child in element.childNodes:
            links, text = self.read_tag(child, prev_url)
            if text:
                # Each fragment keeps the leading space the result has
                # always had, including before the very first fragment.
                text_parts.append(" " + text)
            if links:
                urls.extend(links)

        # Join once at the end instead of growing a string inside the loop.
        return (urls, "".join(text_parts))
Ejemplo n.º 2
0
 def add_url(self, url, referer=None):
     """
     Record a url as passed and enqueue it, if it normalizes successfully.

     Args:
         url (str): url to schedule
         referer (str): optional second argument for normalize_url;
             presumably the page the url was found on -- TODO confirm

     Nothing happens when normalize_url(url, referer) returns a falsy
     value. Otherwise hash(url) is added to self.passed and the pair
     (url, referer) is put on self.queue.
     """
     # NOTE(review): the normalized value is used only as a gate; the raw
     # url is what gets hashed and queued -- confirm this is intended.
     if not normalize_url(url, referer):
         return

     self.passed.add(hash(url))
     self.queue.put((url, referer))