def extract(self, source, url):
    """Return the distinct absolute links found in ``href`` attributes.

    ``source`` is parsed with a recovering, namespace-cleaning XML parser.
    Every element carrying a non-empty ``href`` is considered; hrefs for
    which ``self.expr.match`` is truthy are skipped (``self.expr`` is
    presumably a compiled regex of links to ignore -- confirm at the
    class definition).  Remaining hrefs are resolved against ``url`` and
    passed through ``urlquote`` before being collected.

    :param source: markup to parse.
    :param url: base URL used to resolve relative hrefs.
    :returns: list of unique quoted links, in no particular order; an
        empty list when the document cannot be parsed.
    """
    try:
        tree = etree.fromstring(
            source, etree.XMLParser(recover=True, ns_clean=True))
    except Exception:
        # Not a bare ``except`` so KeyboardInterrupt/SystemExit propagate.
        self.logger.error('Could not parse HTML %s' % traceback.format_exc())
        return []
    if tree is None:
        # A recovering parser may hand back no root element instead of
        # raising; there is nothing to iterate over in that case.
        self.logger.error(
            'Could not parse HTML %s' % 'no root element recovered')
        return []

    links = set()
    for element in tree.iter():
        href = element.get('href')
        if not href or self.expr.match(href):
            continue
        try:
            # Joining lives inside the try as well: a single malformed
            # href must not abort extraction of the remaining links.
            links.add(urlquote(urljoin(url, href)))
        except Exception:
            # Malformed link scheme; log it and move on to the next one.
            self.logger.error('XHtmlExtractor %s ' % traceback.format_exc())
    return list(links)
def extract(self, source, url):
    """Extract unique, quoted, absolute ``href`` targets from ``source``.

    The markup is parsed by ``etree.fromstring`` using an ``XMLParser``
    with ``recover=True`` and ``ns_clean=True``.  For each element with a
    non-empty ``href`` that ``self.expr.match`` does not accept
    (presumably an exclusion pattern -- verify against the owning class),
    the href is joined to ``url`` and run through ``urlquote``.

    :param source: markup to parse.
    :param url: base URL that relative hrefs are joined to.
    :returns: list of distinct quoted links (unordered); ``[]`` if
        parsing fails or produces no root element.
    """
    parser = etree.XMLParser(recover=True, ns_clean=True)
    try:
        tree = etree.fromstring(source, parser)
    except Exception:
        # ``except Exception`` rather than a bare ``except`` so that
        # KeyboardInterrupt and SystemExit are not swallowed here.
        self.logger.error('Could not parse HTML %s' % traceback.format_exc())
        return []
    if tree is None:
        # With recover=True the parser may return no root element rather
        # than raise; guard before walking the tree.
        self.logger.error(
            'Could not parse HTML %s' % 'no root element recovered')
        return []

    links = set()
    for element in tree.iter():
        href = element.get('href')
        if not href or self.expr.match(href):
            continue
        try:
            # urljoin is kept inside the try so one bad href is logged
            # and skipped instead of terminating the whole extraction.
            absolute = urljoin(url, href)
            links.add(urlquote(absolute))
        except Exception:
            # Malformed link scheme; record it and continue.
            self.logger.error('XHtmlExtractor %s ' % traceback.format_exc())
    return list(links)
def extract(self, source, url):
    """Return the set of quoted link URIs found in an HTML document.

    ``source`` is parsed with ``html.fromstring``; the tree's links are
    first passed through ``rewrite_links(self.ignore)`` and then made
    absolute against ``url`` (honouring any ``<base href>``).  For every
    element whose tag has an entry in ``LINK``, each attribute name listed
    there (e.g. ``src``/``lowsrc`` for ``img``) is read, and non-empty
    values are run through ``urlquote`` and collected.

    :param source: HTML markup to parse.
    :param url: base URL used to make links absolute.
    :returns: ``Set`` of quoted URIs; an empty ``Set`` when parsing or
        link rewriting fails.
    """
    try:
        tree = html.fromstring(source)
        tree.rewrite_links(self.ignore)
        tree.make_links_absolute(url, resolve_base_href=True)
    except Exception:
        # Not a bare ``except`` so KeyboardInterrupt/SystemExit propagate.
        self.logger.error('Could not parse HTML %s' % traceback.format_exc())
        return Set([])

    uris = Set([])
    for link in tree.iter():
        # LINK maps a tag to the attribute names that may hold a URI.
        keywords = LINK.get(link.tag)
        if not keywords:
            continue
        for word in keywords:
            uri = link.get(word)
            if not uri:
                continue
            try:
                uris.add(urlquote(uri))
            except Exception:
                # Malformed link scheme: skip the link, but leave a trace
                # in the log instead of dropping it silently.
                self.logger.error(
                    'Could not quote link %s' % traceback.format_exc())
    return uris