Example #1
    def parse(self):
        doc = lxml_parse(StringIO(self.f.read()),
                         base_url=self.f.geturl()).getroot()
        if doc is None:
            return None

        # Make all relative links absolute
        doc.make_links_absolute()

        links = list(doc.cssselect("a"))
        logging.debug("(TID:%d) %s: %d links" %
                      (self.thread_id, "links_" + self.f.geturl(), len(links)))
        for link in links:
            if not link.get("href"):
                continue

            scheme, netloc, path, query, fragment = urlsplit(link.get("href"))
            if scheme.lower() in ["http"]:
                # Add the link to the queue to be processed
                URLHandler.add_url(self.db, link.get("href"))
                # Add the link to the page's link set
                self.db.sadd("links_" + self.f.geturl(), link.get("href"))
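
A note on context: this parse() is a method of a crawler class, so its imports and instance attributes (self.f, self.db, self.thread_id, URLHandler) are not shown. The same pattern works as a standalone function; the sketch below is illustrative only and assumes lxml_parse is lxml.html.parse and that the page is fetched with urllib, matching how self.f (an object with read() and geturl()) is used above.

from urllib.parse import urlsplit
from urllib.request import urlopen

from lxml.html import parse as lxml_parse


def extract_http_links(url):
    # Hypothetical standalone helper; the original keeps this logic in a
    # crawler method and pushes results into Redis instead of a list.
    f = urlopen(url)
    # base_url lets make_links_absolute() resolve relative hrefs against
    # the fetched (possibly redirected) url.
    doc = lxml_parse(f, base_url=f.geturl()).getroot()
    if doc is None:
        return []

    doc.make_links_absolute()

    found = []
    for link in doc.cssselect("a"):
        href = link.get("href")
        if not href:
            continue
        # The original snippet filters on the bare "http" scheme; https is
        # accepted here as well.
        if urlsplit(href).scheme.lower() in ("http", "https"):
            found.append(href)
    return found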
Example #2
def grab_content(url, original_description):
    logging.debug("grabbing the content of %s" % url)
    try:
        site = urlopen(url)
    except Exception as e:
        logging.debug("Error: can't access %s for grabbing: %s" % (url, e))
        return original_description + "\n<p><b>WarGod error</b>: I could not access this url</p>"
    site_url = "/".join(site.geturl().split("/")[:3]) + "/"
    path = get_grabber(url)
    logging.debug("xpath query is %s" % path)
    try:
        xml = lxml_parse(site)
        content = xml.xpath(path)[0]
    except Unparseable:
        logging.debug("Error: can't parse %s using %s" % (url, path))
        return original_description + "\n<p><b>WarGod error</b>: I could not parse this url</p>"
    except IndexError:
        logging.debug("Error: can't get content of %s using %s" % (url, path))
        return original_description + "\n<p><b>WarGod error</b>: I could not parse this url</p>"
    except Exception as e:
        sys.stderr.write("Unexpected error: can't get content of %s using %s: %s\n" % (url, path, e))
        return original_description + "\n<p><b>WarGod error</b>: I could not parse this url</p>"
    content.make_links_absolute(site_url)
    return etree.tostring(content, encoding="Utf-8").decode("Utf-8")
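
grab_content depends on helpers that are not shown here: get_grabber returns a per-site XPath expression, Unparseable comes from the parsing layer, and urlopen / lxml_parse / etree are the usual urllib and lxml imports. A minimal sketch of the core fetch-and-extract step, with an explicit xpath argument standing in for get_grabber (the names and the example query below are illustrative, not the original helpers):

from urllib.request import urlopen

from lxml import etree
from lxml.html import parse as lxml_parse


def grab_element(url, xpath):
    """Fetch url and return the first element matching xpath as an html string."""
    site = urlopen(url)
    # Keep only scheme://host/ of the final (post-redirect) url; it becomes
    # the base for rewriting relative links inside the extracted element.
    site_url = "/".join(site.geturl().split("/")[:3]) + "/"

    doc = lxml_parse(site)
    matches = doc.xpath(xpath)
    if not matches:
        return None

    content = matches[0]
    content.make_links_absolute(site_url)
    return etree.tostring(content, encoding="utf-8").decode("utf-8")


# e.g. grab_element("http://example.com/article", "//div[@id='content']")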
Example #3
    def parse(self, parser=None, base_url=None):
        """Parses the underlying html source using `lxml` library.

        This parsed tree is stored in :attr:`root` of this object.
        which could be used to perform numerous operations.

        Returns
        -------
            ElementTree
        """
        utx = self._get_utx()

        assert utx is not None, "UrlTransformer not Implemented."  # internal error
        assert utx.base_path is not None, "Base Path is not set!"
        assert utx.base_url is not None, "Base url is not Set!"
        if parser is not None and not isinstance(parser, HTMLParser):
            raise TypeError("Expected instance of <%r>, got <%r>" %
                            (HTMLParser, parser))

        if not parser:
            parser = HTMLParser(encoding=self.encoding, collect_ids=False)

        source = self.get_source()

        assert source is not None, "Source is not Set!"
        assert hasattr(source, 'read'), "File like object is required!"
        # assert self._element_factory is not None
        # assert hasattr(self._element_factory, 'make_element')
        LOGGER.info(
            'Parsing tree with source: <%r> encoding <%s> and parser <%r>' %
            (self._source, self.encoding, parser))

        context_tree = lxml_parse(source, parser=parser, base_url=base_url)
        # The tree generated by the parse is kept in self._tree; its root
        # element goes into self.root and can be reused for any number of
        # operations.
        self._tree = context_tree
        self.root = context_tree.getroot()

        if self.root is not None:
            # WaterMarking :)
            self.root.insert(
                0, Comment(MARK.format('', __version__, utx.url, utc_now(),
                                       '')))

        # The html page can contain internal links such as `#` fragments,
        # `javascript:` and `data:base64;` urls, or a bare `/` href on an
        # anchor tag; these links need to be left as-is.
        factory = getattr(self, 'make_element', None)
        assert callable(factory), "Element generator is not callable!"

        # Modify the tree elements
        for el in context_tree.iter():
            # An element can contain multiple urls
            for pack in self._handle_lxml_elem(el):

                if pack is not None:
                    elem, attr, url, pos = pack
                else:  # pragma: no cover
                    continue

                if elem is not None:
                    o = factory(elem, attr, url, pos)
                    if o is not None:
                        self._stack.append(o)

        self._parseComplete = True
        return self.root
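
This parse() belongs to a larger page/scraper class, so the pieces it relies on (_get_utx, get_source, make_element, _handle_lxml_elem, MARK, __version__) are not reproducible from this snippet alone. The lxml calls it wraps can still be exercised in isolation; the sketch below is illustrative only, using an in-memory source and a made-up base_url.

from io import BytesIO

from lxml.etree import Comment
from lxml.html import HTMLParser, parse as lxml_parse

# Parse a file-like source with an explicit parser and base_url, mirroring
# the core lxml_parse(...) call inside parse() above.
source = BytesIO(b"<html><body><a href='/about'>about</a></body></html>")
parser = HTMLParser(encoding="utf-8", collect_ids=False)

tree = lxml_parse(source, parser=parser, base_url="http://example.com/")
root = tree.getroot()

# A comment node can be inserted as the first child of the root element,
# which is what the watermarking step with MARK does above.
root.insert(0, Comment(" parsed from http://example.com/ "))

# iter() walks the parsed elements; parse() uses the same traversal to hand
# each element's urls to make_element via _handle_lxml_elem.
for el in root.iter("a"):
    print(el.get("href"))  # prints /about (left as-is, not made absolute)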