Example #1
import lxml.html

# Method of a crawler/extractor class (self.save_result, self.url_dedup and
# self.logger are provided by that class).
def extract(self, body, url):
    # Parse the HTML body and collect every <a> element.
    tree = lxml.html.document_fromstring(body)
    a_elements = tree.xpath("//a")
    # Filter the anchors down to usable href values.
    urls = valid_a_href(a_elements)
    self.save_result(url, urls)
    # Record the URLs in the dedup store and keep only the unseen ones.
    not_exist = self.url_dedup.insert_not_exist(urls)
    self.logger.info("previously unseen urls: %s", not_exist)
    return not_exist
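
Both examples call a valid_a_href helper that is not shown on this page. The sketch below is only a guess at what such a filter could look like, assuming it takes the <a> elements plus an optional base URL for resolving relative links; the real helper may behave differently.

from urllib.parse import urljoin, urlparse

def valid_a_href(a_elements, base_url=None):
    # Hypothetical helper: collect usable href values from <a> elements,
    # skipping empty, fragment-only and javascript:/mailto: links.
    urls = []
    for a in a_elements:
        href = a.get('href')
        if not href or href.startswith(('javascript:', 'mailto:', '#')):
            continue
        # Resolve relative links against the page URL when one is given.
        if base_url:
            href = urljoin(base_url, href)
        if urlparse(href).scheme in ('http', 'https'):
            urls.append(href)
    return urls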
Example #2
import lxml.html

# Method of a crawler class (self.db_helper, self.url_dedup, self.logger and
# self.maxdepth are provided by that class).
def process_body(self, body, task):
    url = task.get('url')
    body_size = len(body)
    body = to_unicode(body)
    # str.replace returns a new string, so the result must be reassigned;
    # the declaration is stripped because lxml rejects unicode input that
    # still carries an encoding declaration.
    body = body.replace('<?xml version="1.0" encoding="utf-8"?>', '')
    self.logger.info("page body, url:%s, body:%s" % (url, body[:100]))
    self.db_helper.save_mining_result(body, body_size, task)
    if task.get('depth') <= self.maxdepth:
        # Parse the page and collect links for the next crawl depth.
        tree = lxml.html.document_fromstring(body)
        a_elements = tree.xpath('//a')
        urls = valid_a_href(a_elements, url)
        # Queue only URLs that have not been seen before.
        not_exist = self.url_dedup.insert_not_exist(urls)
        self.db_helper.insert_mining_task(task, not_exist)
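
The to_unicode call above is another helper that is not defined on this page. A minimal sketch, assuming it simply decodes raw response bytes to text (the fallback behaviour is an assumption):

def to_unicode(body, encoding='utf-8'):
    # Hypothetical decoder: return text unchanged, otherwise decode bytes,
    # replacing undecodable characters rather than failing.
    if isinstance(body, str):
        return body
    return body.decode(encoding, errors='replace')

Exact encoding detection (for example from the response's Content-Type header) is left out, since the examples give no hint of how the real helper handles it.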