def extract(self, body, url):
    """Extract outgoing links from a fetched page and return the new ones.

    Parses *body* as HTML, collects all ``<a>`` hrefs, persists the full
    link list for *url*, then filters through the dedup store so only
    previously-unseen URLs are returned.

    :param body: raw HTML of the fetched page.
    :param url: the page's own URL; used as the save key and as the base
        for resolving relative hrefs.
    :return: list of URLs not already present in the dedup store.
    """
    tree = lxml.html.document_fromstring(body)
    a_elements = tree.xpath("//a")
    # Pass the page URL as the base so relative hrefs can be resolved,
    # consistent with how process_body() calls valid_a_href.
    urls = valid_a_href(a_elements, url)
    self.save_result(url, urls)
    # insert_not_exist both records the urls and reports which were new.
    not_exist = self.url_dedup.insert_not_exist(urls)
    # Lazy %-args: the message is only formatted if INFO is enabled.
    self.logger.info("not exist urls. urls=%s", not_exist)
    return not_exist
def process_body(self, body, task):
    """Persist a fetched page body and, within the depth limit, queue its links.

    :param body: raw response body of the fetched page (bytes or str).
    :param task: crawl-task dict; expected to contain at least 'url' and 'depth'.
    """
    url = task.get('url')
    # Size is taken from the raw body, before unicode conversion.
    body_size = len(body)
    body = to_unicode(body)
    # BUG FIX: str.replace returns a new string, so the result must be
    # re-assigned — the original call discarded it and the declaration was
    # never stripped.  Presumably removed because lxml rejects unicode
    # input carrying an explicit encoding declaration — TODO confirm.
    body = body.replace('<?xml version="1.0" encoding="utf-8"?>', '')
    self.logger.info("page body, url:%s, body:%s" % (url, body[:100]))
    self.db_helper.save_mining_result(body, body_size, task)
    if task.get('depth') <= self.maxdepth:
        tree = lxml.html.document_fromstring(body)
        a_elements = tree.xpath('//a')
        urls = valid_a_href(a_elements, url)
        # Only schedule URLs that have not been seen before.
        not_exist = self.url_dedup.insert_not_exist(urls)
        self.db_helper.insert_mining_task(task, not_exist)