Example #1
0
    def start(self, is_continue=False):
        if is_continue:
            self.queue_resource = Queue.load(logs_path + 'queue_resource.json')
            self.queue_document = Queue.load(logs_path + 'queue_document.json')
            self.set = UrlSet.load(logs_path + 'url_set.json')
        else:
            self.queue_resource = Queue(logs_path + 'queue_resource.json')
            self.queue_document = Queue(logs_path + 'queue_document.json')
            self.set = UrlSet(logs_path + 'url_set.json')

        self.document_task = Task(self.queue_document, doc_pool_max)
        self.document_task.initTaskWork(self.getDocument)

        self.source_task = Task(self.queue_resource, res_pool_max)
        self.source_task.initTaskWork(self.requestSource)

        self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url)
        print self.main_url, self.host
        file_path, file_name, html_url = urlTools.dealUrl2File(self.main_url, self.main_url, self.host, True)
        self.queue_document.push([self.main_url, file_path, file_name, 0, 0])
        print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url
        self.document_task.start()
Example #2
0
    def start(self, is_continue=False):
        if is_continue:
            self.queue_resource = Queue.load(logs_path + 'queue_resource.json')
            self.queue_document = Queue.load(logs_path + 'queue_document.json')
            self.set = UrlSet.load(logs_path + 'url_set.json')
        else:
            self.queue_resource = Queue(logs_path + 'queue_resource.json')
            self.queue_document = Queue(logs_path + 'queue_document.json')
            self.set = UrlSet(logs_path + 'url_set.json')

        self.document_task = Task(self.queue_document, doc_pool_max)
        self.document_task.initTaskWork(self.getDocument)

        self.source_task = Task(self.queue_resource, res_pool_max)
        self.source_task.initTaskWork(self.requestSource)

        self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url)
        print self.main_url, self.host
        file_path, file_name, html_url = urlTools.dealUrl2File(
            self.main_url, self.main_url, self.host, True)
        self.queue_document.push([self.main_url, file_path, file_name, 0, 0])
        print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url
        self.document_task.start()
Example #3
0
class Crawler:
    def __init__(self, url):
        self.main_url = Url(self.requestGet(url).url)
        self.host_option, self.host, self.host_url = self.main_url.getHost()

        self.queue_resource = None
        self.queue_document = None
        self.set = None
        self.document_task = None
        self.source_task = None

    def __del__(self):
        # del self.document_task
        # del self.queue_document
        # del self.source_task
        # del self.queue_resource
        print 'del c'

    def start(self, is_continue=False):
        if is_continue:
            self.queue_resource = Queue.load(logs_path + 'queue_resource.json')
            self.queue_document = Queue.load(logs_path + 'queue_document.json')
            self.set = UrlSet.load(logs_path + 'url_set.json')
        else:
            self.queue_resource = Queue(logs_path + 'queue_resource.json')
            self.queue_document = Queue(logs_path + 'queue_document.json')
            self.set = UrlSet(logs_path + 'url_set.json')

        self.document_task = Task(self.queue_document, doc_pool_max)
        self.document_task.initTaskWork(self.getDocument)

        self.source_task = Task(self.queue_resource, res_pool_max)
        self.source_task.initTaskWork(self.requestSource)

        self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url)
        print self.main_url, self.host
        file_path, file_name, html_url = urlTools.dealUrl2File(
            self.main_url, self.main_url, self.host, True)
        self.queue_document.push([self.main_url, file_path, file_name, 0, 0])
        print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url
        self.document_task.start()

    def requestGet(self, url):
        wait = random.random() * (wait_time[1] - wait_time[0])
        sleep(wait)
        timeout = Timeout(request_timeout)
        timeout.start()
        try:
            req = requests.get(url=url,
                               verify=True,
                               headers=headers,
                               proxies=proxies)
        except IncompleteRead:
            pass
            # todo:未知错误,暂还未查清
        timeout.cancel()
        return req

    def saveFile(self, file_path, file_name, bytes):
        path = Url(output + file_path)
        path = path.addUrlEnd()
        if not os.path.exists(path):
            os.makedirs(path)
        try:
            f = open(path + file_name, "wb")
            f.write(bytes)
            f.close()
        except IOError, e:
            print 'save Error: ', e, 'path: ', path, 'name: ', file_name
Example #4
0
class Crawler:
    def __init__(self, url):
        self.main_url = Url(self.requestGet(url).url)
        self.host_option, self.host, self.host_url = self.main_url.getHost()

        self.queue_resource = None
        self.queue_document = None
        self.set = None
        self.document_task = None
        self.source_task = None

    def __del__(self):
        # del self.document_task
        # del self.queue_document
        # del self.source_task
        # del self.queue_resource
        print 'del c'

    def start(self, is_continue=False):
        if is_continue:
            self.queue_resource = Queue.load(logs_path + 'queue_resource.json')
            self.queue_document = Queue.load(logs_path + 'queue_document.json')
            self.set = UrlSet.load(logs_path + 'url_set.json')
        else:
            self.queue_resource = Queue(logs_path + 'queue_resource.json')
            self.queue_document = Queue(logs_path + 'queue_document.json')
            self.set = UrlSet(logs_path + 'url_set.json')

        self.document_task = Task(self.queue_document, doc_pool_max)
        self.document_task.initTaskWork(self.getDocument)

        self.source_task = Task(self.queue_resource, res_pool_max)
        self.source_task.initTaskWork(self.requestSource)

        self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url)
        print self.main_url, self.host
        file_path, file_name, html_url = urlTools.dealUrl2File(self.main_url, self.main_url, self.host, True)
        self.queue_document.push([self.main_url, file_path, file_name, 0, 0])
        print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url
        self.document_task.start()

    def requestGet(self, url):
        wait = random.random() * (wait_time[1] - wait_time[0])
        sleep(wait)
        timeout = Timeout(request_timeout)
        timeout.start()
        try:
            req = requests.get(url=url, verify=True, headers=headers, proxies=proxies)
        except IncompleteRead:
            pass
            # todo:未知错误,暂还未查清
        timeout.cancel()
        return req

    def saveFile(self, file_path, file_name, bytes):
        path = Url(output + file_path)
        path = path.addUrlEnd()
        if not os.path.exists(path):
            os.makedirs(path)
        try:
            f = open(path + file_name, "wb")
            f.write(bytes)
            f.close()
        except IOError, e:
            print 'save Error: ', e, 'path: ', path, 'name: ', file_name