def getDocument(self, url, file_path, file_name, deep, error_count): if 0 <= recursion_deep < deep or error_count > document_error_max: return url = urlTools.dealUrl2Request(url, url) if file_path == '' and file_name == '': file_name = 'index.html' try: req = self.requestGet(url) charset = self.getHTMLCharset(req.content) req.encoding = charset d = pq(req.text) # print charset linkList1 = d('link') self.dealSourceLink(linkList1, Url(req.url), 'href') linkList2 = d('script') self.dealSourceLink(linkList2, Url(req.url), 'src') linkList3 = d('img') self.dealSourceLink(linkList3, Url(req.url), 'src') linkList4 = d('a') self.dealALink(linkList4, Url(req.url), 'href', deep) self.source_task.start() self.saveFile(file_path, file_name, bytearray(source=d.outer_html(), encoding='utf-8')) except requests.exceptions.ConnectionError, e: print 'ConnectionError:', e self.queue_document.push([url, file_path, file_name, deep, error_count + 1])
def getDocument(self, url, file_path, file_name, deep, error_count): if 0 <= recursion_deep < deep or error_count > document_error_max: return url = urlTools.dealUrl2Request(url, url) if file_path == '' and file_name == '': file_name = 'index.html' try: req = self.requestGet(url) charset = self.getHTMLCharset(req.content) req.encoding = charset d = pq(req.text) # print charset linkList1 = d('link') self.dealSourceLink(linkList1, Url(req.url), 'href') linkList2 = d('script') self.dealSourceLink(linkList2, Url(req.url), 'src') linkList3 = d('img') self.dealSourceLink(linkList3, Url(req.url), 'src') linkList4 = d('a') self.dealALink(linkList4, Url(req.url), 'href', deep) self.source_task.start() self.saveFile(file_path, file_name, bytearray(source=d.outer_html(), encoding='utf-8')) except requests.exceptions.ConnectionError, e: print 'ConnectionError:', e self.queue_document.push( [url, file_path, file_name, deep, error_count + 1])
def dealSourceLink(self, linkList, origin_url, attr):
    """Point each element's `attr` at the local mirror path and queue
    the referenced resource for download (first sighting only)."""
    for node in linkList:
        raw = pq(node).attr(attr)
        if raw is None:
            continue
        request_url = urlTools.dealUrl2Request(Url(raw), origin_url)
        if request_url in self.set:
            # Already seen: reuse the recorded local mapping.
            file_path, file_name, html_url = self.set[request_url]
        else:
            file_path, file_name, html_url = urlTools.dealUrl2File(
                request_url, origin_url, self.host, True)
            error_count = 0
            self.queue_resource.push(
                [request_url, file_path, file_name, error_count])
            self.set[request_url] = [file_path, file_name, html_url]
        # Rewrite the attribute so the saved page uses the local copy.
        pq(node).attr(attr, html_url)
def dealALink(self, linkList, origin_url, attr, deep):
    """Rewrite in-scope anchors to local paths and queue the target
    pages for crawling at depth + 1."""
    for node in linkList:
        raw = pq(node).attr(attr)
        if raw is None:
            continue
        request_url = urlTools.dealUrl2Request(Url(raw), origin_url)
        # print 'A:', request_url
        # Only follow links on the mirrored host, unless off-site
        # pages are explicitly enabled.
        in_scope = outsite_page or request_url.getHost()[1] == self.host
        if not in_scope:
            continue
        if request_url in self.set:
            file_path, file_name, html_url = self.set[request_url]
        else:
            file_path, file_name, html_url = urlTools.dealUrl2File(
                request_url, origin_url, self.host, True)
            # New page: crawl it one level deeper with a clean error count.
            self.queue_document.push(
                [request_url, file_path, file_name, deep + 1, 0])
            self.set[request_url] = [file_path, file_name, html_url]
        pq(node).attr(attr, html_url)
def dealSourceLink(self, linkList, origin_url, attr):
    """Localize each element's `attr` URL; unseen resources are pushed
    onto the resource queue with a zeroed error count."""
    for element in linkList:
        attr_value = pq(element).attr(attr)
        if attr_value is not None:
            absolute = urlTools.dealUrl2Request(Url(attr_value), origin_url)
            known = absolute in self.set
            if known:
                file_path, file_name, html_url = self.set[absolute]
            else:
                mapping = urlTools.dealUrl2File(absolute, origin_url,
                                                self.host, True)
                file_path, file_name, html_url = mapping
                # First time we see this resource: schedule its download
                # and remember where it will live locally.
                self.queue_resource.push([absolute, file_path, file_name, 0])
                self.set[absolute] = [file_path, file_name, html_url]
            pq(element).attr(attr, html_url)
def dealCss(self, text, origin_url):
    """Rewrite every url(...) reference in a CSS body to its local
    mirror path, queueing unseen resources for download.

    Handles single-quoted, double-quoted and bare url() forms; data:
    URIs are left untouched.  Returns the rewritten CSS text.
    """
    # Fix: the original bound this to `list`, shadowing the builtin.
    matches = re.findall(r'url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)', text)
    for groups in matches:
        # Exactly one alternation group is non-empty per match.
        for raw in groups:
            if raw != '' and not raw.startswith('data'):
                request_url = urlTools.dealUrl2Request(raw, origin_url)
                if request_url in self.set:
                    file_path, file_name, html_url = self.set[request_url]
                else:
                    file_path, file_name, html_url = urlTools.dealUrl2File(
                        request_url, origin_url, self.host, True)
                    self.queue_resource.push(
                        [request_url, file_path, file_name, 0])
                    self.set[request_url] = [file_path, file_name, html_url]
                # self.requestSource(request_url, file_path, file_name)
                # NOTE(review): bare .encode() uses the default codec and
                # may raise on non-ASCII local paths — confirm html_url
                # is always ASCII-safe.
                text = text.replace(raw, html_url.encode())
    return text
def dealALink(self, linkList, origin_url, attr, deep):
    """Localize anchor targets and enqueue unvisited same-host pages
    (or any page when off-site crawling is enabled) at depth + 1."""
    for anchor in linkList:
        href = pq(anchor).attr(attr)
        if href is None:
            continue
        target = urlTools.dealUrl2Request(Url(href), origin_url)
        # print 'A:', target
        if outsite_page or target.getHost()[1] == self.host:
            seen = target in self.set
            if seen:
                file_path, file_name, html_url = self.set[target]
            else:
                file_path, file_name, html_url = urlTools.dealUrl2File(
                    target, origin_url, self.host, True)
                # Unvisited page: schedule a crawl one level deeper.
                task = [target, file_path, file_name, deep + 1, 0]
                self.queue_document.push(task)
                self.set[target] = [file_path, file_name, html_url]
            pq(anchor).attr(attr, html_url)
def dealCss(self, text, origin_url):
    """Replace url(...) references inside CSS with local mirror paths.

    Unseen resources are appended to the resource queue; data: URIs
    are skipped.  Returns the CSS text with references rewritten.
    """
    # Fix: avoid shadowing the builtin `list` (original variable name).
    found = re.findall(r'url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)', text)
    for alternation_groups in found:
        for candidate in alternation_groups:
            # Empty strings are the unmatched alternation branches;
            # data: URIs are inline content, not fetchable resources.
            if candidate == '' or candidate.startswith('data'):
                continue
            request_url = urlTools.dealUrl2Request(candidate, origin_url)
            if request_url in self.set:
                file_path, file_name, html_url = self.set[request_url]
            else:
                file_path, file_name, html_url = urlTools.dealUrl2File(
                    request_url, origin_url, self.host, True)
                self.queue_resource.push(
                    [request_url, file_path, file_name, 0])
                self.set[request_url] = [file_path, file_name, html_url]
            # self.requestSource(request_url, file_path, file_name)
            # NOTE(review): default-codec .encode() can raise on
            # non-ASCII paths — verify html_url stays ASCII.
            text = text.replace(candidate, html_url.encode())
    return text
def start(self, is_continue=False): if is_continue: self.queue_resource = Queue.load(logs_path + 'queue_resource.json') self.queue_document = Queue.load(logs_path + 'queue_document.json') self.set = UrlSet.load(logs_path + 'url_set.json') else: self.queue_resource = Queue(logs_path + 'queue_resource.json') self.queue_document = Queue(logs_path + 'queue_document.json') self.set = UrlSet(logs_path + 'url_set.json') self.document_task = Task(self.queue_document, doc_pool_max) self.document_task.initTaskWork(self.getDocument) self.source_task = Task(self.queue_resource, res_pool_max) self.source_task.initTaskWork(self.requestSource) self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url) print self.main_url, self.host file_path, file_name, html_url = urlTools.dealUrl2File(self.main_url, self.main_url, self.host, True) self.queue_document.push([self.main_url, file_path, file_name, 0, 0]) print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url self.document_task.start()
def start(self, is_continue=False): if is_continue: self.queue_resource = Queue.load(logs_path + 'queue_resource.json') self.queue_document = Queue.load(logs_path + 'queue_document.json') self.set = UrlSet.load(logs_path + 'url_set.json') else: self.queue_resource = Queue(logs_path + 'queue_resource.json') self.queue_document = Queue(logs_path + 'queue_document.json') self.set = UrlSet(logs_path + 'url_set.json') self.document_task = Task(self.queue_document, doc_pool_max) self.document_task.initTaskWork(self.getDocument) self.source_task = Task(self.queue_resource, res_pool_max) self.source_task.initTaskWork(self.requestSource) self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url) print self.main_url, self.host file_path, file_name, html_url = urlTools.dealUrl2File( self.main_url, self.main_url, self.host, True) self.queue_document.push([self.main_url, file_path, file_name, 0, 0]) print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url self.document_task.start()