def dealUrl2Request(self, url, origin):
    # Resolve a (possibly relative) link found on `origin` into an absolute,
    # request-ready URL.
    origin = Url(origin)
    url = Url(url)
    if not url.startswith('http://') and not url.startswith('https://'):
        if url.startswith('//'):
            # protocol-relative link: //host/path
            url = 'http:' + url
        elif url.startswith('/'):
            # root-relative link: prepend the origin's host URL
            url = origin.getHost()[2] + url
        else:
            # directory-relative link: resolve against the origin's directory
            origin = origin.getUrlDir()
            url = origin + url
    url = url.simplifyUrl()
    return url
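# A minimal usage sketch (assuming urlTools is an instance of the URL-helper
# class this method belongs to, and that Url behaves like str as used above);
# protocol-relative, root-relative and directory-relative links are all
# resolved against the page they were found on:
#
#     origin = 'http://example.com/blog/index.html'                  # hypothetical page
#     urlTools.dealUrl2Request('//cdn.example.com/a.js', origin)     # -> 'http://cdn.example.com/a.js'
#     urlTools.dealUrl2Request('/css/main.css', origin)              # -> 'http://example.com/css/main.css'
#     urlTools.dealUrl2Request('img/logo.png', origin)               # -> 'http://example.com/blog/img/logo.png'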
class Crawler:
    def __init__(self, url):
        # Request the start URL once so main_url reflects any redirects.
        self.main_url = Url(self.requestGet(url).url)
        self.host_option, self.host, self.host_url = self.main_url.getHost()
        self.queue_resource = None
        self.queue_document = None
        self.set = None
        self.document_task = None
        self.source_task = None

    def __del__(self):
        # del self.document_task
        # del self.queue_document
        # del self.source_task
        # del self.queue_resource
        print 'del c'

    def start(self, is_continue=False):
        # Either resume from the persisted queues/URL set or start fresh.
        if is_continue:
            self.queue_resource = Queue.load(logs_path + 'queue_resource.json')
            self.queue_document = Queue.load(logs_path + 'queue_document.json')
            self.set = UrlSet.load(logs_path + 'url_set.json')
        else:
            self.queue_resource = Queue(logs_path + 'queue_resource.json')
            self.queue_document = Queue(logs_path + 'queue_document.json')
            self.set = UrlSet(logs_path + 'url_set.json')
        self.document_task = Task(self.queue_document, doc_pool_max)
        self.document_task.initTaskWork(self.getDocument)
        self.source_task = Task(self.queue_resource, res_pool_max)
        self.source_task.initTaskWork(self.requestSource)
        # Seed the document queue with the normalized start page.
        self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url)
        print self.main_url, self.host
        file_path, file_name, html_url = urlTools.dealUrl2File(
            self.main_url, self.main_url, self.host, True)
        self.queue_document.push([self.main_url, file_path, file_name, 0, 0])
        print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url
        self.document_task.start()

    def requestGet(self, url):
        # Random pause between requests to avoid hammering the server.
        wait = random.random() * (wait_time[1] - wait_time[0])
        sleep(wait)
        timeout = Timeout(request_timeout)
        timeout.start()
        req = None
        try:
            req = requests.get(url=url, verify=True, headers=headers, proxies=proxies)
        except IncompleteRead:
            pass  # todo: unknown error, cause not yet investigated; req stays None
        timeout.cancel()
        return req

    def saveFile(self, file_path, file_name, bytes):
        # Ensure the target directory exists, then write the raw bytes.
        path = Url(output + file_path)
        path = path.addUrlEnd()
        if not os.path.exists(path):
            os.makedirs(path)
        try:
            f = open(path + file_name, "wb")
            f.write(bytes)
            f.close()
        except IOError, e:
            print 'save Error: ', e, 'path: ', path, 'name: ', file_name
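# A minimal usage sketch, assuming the module-level config (output, logs_path,
# headers, proxies, wait_time, request_timeout, doc_pool_max, res_pool_max)
# and the helper classes (Url, Queue, UrlSet, Task, urlTools) are defined
# elsewhere in this project; the start URL is hypothetical:
#
#     c = Crawler('http://example.com/')
#     c.start()                   # fresh crawl: new queues and URL set
#     # or resume a previous run from the persisted JSON state:
#     c.start(is_continue=True)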