def parse(self):
    """Extract the anchors of the fetched page and record its http links.

    The body of ``self.f`` is parsed with ``self.f.geturl()`` as base URL
    and relative links are rewritten as absolute ones. Each anchor whose
    ``href`` has the ``http`` scheme is passed to ``URLHandler.add_url``
    and added to the ``"links_" + <page url>`` set held by ``self.db``.

    Returns ``None`` in every case, including the early exit taken when
    the parsed document has no root element.
    """
    markup = StringIO(self.f.read())
    page_url = self.f.geturl()
    root = lxml_parse(markup, base_url=page_url).getroot()
    if root is None:
        return None

    # Make all relative links absolute
    root.make_links_absolute()

    anchors = list(root.cssselect("a"))
    link_set_key = "links_" + page_url
    logging.debug("(TID:%d) %s: %d links" % (self.thread_id, link_set_key, len(anchors)))

    for anchor in anchors:
        href = anchor.get("href")
        if not href:
            continue
        if urlsplit(href).scheme.lower() != "http":
            continue
        # Add the link to the queue to be processed
        URLHandler.add_url(self.db, href)
        # Add the link to the page's link set
        self.db.sadd(link_set_key, href)
def parse(self):
    """Parse the downloaded page and register every http link it contains.

    ``self.f`` supplies both the page body (``read()``) and its URL
    (``geturl()``), the latter being used as the base URL for parsing.
    After links are made absolute, each ``<a>`` element with a non-empty
    ``href`` whose scheme is ``http`` is queued through
    ``URLHandler.add_url`` and stored in the ``"links_" + <page url>`` set
    of ``self.db``.

    The method returns ``None``; it stops early when the parsed document
    has no root element.
    """
    body = self.f.read()
    source_url = self.f.geturl()
    document = lxml_parse(StringIO(body), base_url=source_url).getroot()
    if document is not None:
        # Make all relative links absolute
        document.make_links_absolute()

        elements = list(document.cssselect("a"))
        set_name = "links_" + source_url
        logging.debug("(TID:%d) %s: %d links" % (self.thread_id, set_name, len(elements)))

        for element in elements:
            target = element.get("href")
            if target and urlsplit(target)[0].lower() == "http":
                # Add the link to the queue to be processed
                URLHandler.add_url(self.db, target)
                # Add the link to the page's link set
                self.db.sadd(set_name, target)
    return None
def grab_content(url, original_description):
    """Fetch `url` and return the markup selected by its grabber XPath.

    The XPath query comes from ``get_grabber(url)``. The first node it
    matches has its links made absolute against the site root of the final
    (post-redirect) URL and is serialized to a UTF-8 decoded string.

    Parameters:
        url: address of the page to fetch.
        original_description: text returned, with an error paragraph
            appended, whenever the page cannot be fetched or parsed.

    Returns:
        The serialized matched node on success; otherwise
        `original_description` followed by a "WarGod error" paragraph.
    """
    logging.debug("grabbing the content of %s" % url)
    try:
        site = urlopen(url)
    except Exception as e:
        logging.debug("Error: can't access %s for grabbing: %s" % (url, e))
        return original_description + "\n<p><b>WarGod error</b>: I could not access this url</p>"

    parse_failure = original_description + "\n<p><b>WarGod error</b>: I could not parse this url</p>"
    try:
        # Keep only the first three '/'-separated pieces of the final URL
        # ("scheme:", "", "host") to rebuild the site root.
        site_url = "/".join(site.geturl().split("/")[:3]) + "/"
        path = get_grabber(url)
        logging.debug("xpath query is %s" % path)
        try:
            xml = lxml_parse(site)
            content = xml.xpath(path)[0]
        except Unparseable:
            logging.debug("Error: can't parse %s using %s" % (url, path))
            return parse_failure
        except IndexError:
            # The query matched nothing, so there is no first node to take.
            logging.debug("Error: can't get content of %s using %s" % (url, path))
            return parse_failure
        except Exception as e:
            # Last-resort handler: report the actual exception so the
            # failure can be diagnosed, and terminate the stderr line.
            sys.stderr.write(
                "Unpredicted Error: can't get content of %s using %s: %s\n" % (url, path, e))
            return parse_failure
    finally:
        # The response is no longer needed once parsing has finished or
        # failed; close it on every path so the connection is not leaked.
        site.close()

    content.make_links_absolute(site_url)
    return etree.tostring(content, encoding="Utf-8").decode("Utf-8")
def parse(self, parser=None, base_url=None):
    """Parses the underlying html source using `lxml` library.

    The parsed tree is stored in :attr:`_tree` and its root element in
    :attr:`root`. Every ``(elem, attr, url, pos)`` pack produced by
    :meth:`_handle_lxml_elem` is handed to ``self.make_element`` and each
    non-``None`` result is appended to ``self._stack``.

    Parameters
    ----------
    parser : HTMLParser, optional
        Parser passed to lxml. When omitted (or falsy) an ``HTMLParser``
        is created with this object's encoding and ``collect_ids=False``.
    base_url : optional
        Forwarded unchanged to the lxml parse call.

    Returns
    -------
    The root element of the parsed tree, i.e. :attr:`root`
    (may be ``None`` when the tree has no root).

    Raises
    ------
    TypeError
        If `parser` is supplied but is not an ``HTMLParser`` instance.
    AssertionError
        If the url transformer, its base path or base url, the source
        (which must provide ``read``), or a callable ``make_element``
        is missing.
    """
    utx = self._get_utx()

    assert utx is not None, "UrlTransformer not Implemented."  # internal error
    assert utx.base_path is not None, "Base Path is not set!"
    assert utx.base_url is not None, "Base url is not Set!"

    # Fall back to the default parser first so that the default argument
    # (None) is never rejected; anything else must be a real HTMLParser.
    if not parser:
        parser = HTMLParser(encoding=self.encoding, collect_ids=False)
    elif not isinstance(parser, HTMLParser):
        raise TypeError(
            "Expected instance of <%r>, got <%r>" % (HTMLParser, parser))

    source = self.get_source()

    assert source is not None, "Source is not Set!"
    assert hasattr(source, 'read'), "File like object is required!"
    # assert self._element_factory is not None
    # assert hasattr(self._element_factory, 'make_element')

    LOGGER.info(
        'Parsing tree with source: <%r> encoding <%s> and parser <%r>'
        % (self._source, self.encoding, parser))

    context_tree = lxml_parse(source, parser=parser, base_url=base_url)
    # The tree generated by the parse is stored in the self.root
    # variable and can be utilised further for any number of use cases
    self._tree = context_tree
    self.root = context_tree.getroot()

    if self.root is not None:
        # WaterMarking :)
        self.root.insert(
            0, Comment(MARK.format('', __version__, utx.url, utc_now(), '')))

    # There are internal links present on the html page which are files
    # that includes `#` and `javascript:` and 'data:base64;` type links
    # or a simple `/` url referring anchor tag
    # thus these links needs to be left as is.
    factory = getattr(self, 'make_element', None)
    assert callable(factory), "Element generator is not callable!"

    # Modify the tree elements
    for el in context_tree.iter():
        # A element can contain multiple urls
        for pack in self._handle_lxml_elem(el):
            if pack is None:  # pragma: no cover
                continue
            elem, attr, url, pos = pack
            if elem is None:
                continue
            o = factory(elem, attr, url, pos)
            if o is not None:
                self._stack.append(o)

    self._parseComplete = True
    return self.root