Ejemplo n.º 1
0
 def dealSourceLink(self, linkList, origin_url, attr):
     """Rewrite the ``attr`` attribute of each element to its stored html_url.

     For every element of ``linkList`` that carries ``attr``, the value is
     wrapped in ``Url`` and turned into a request URL relative to
     ``origin_url``. A request URL not yet present in ``self.set`` gets its
     file_path / file_name / html_url from ``urlTools.dealUrl2File``, is
     pushed onto ``self.queue_resource`` and is recorded in ``self.set``.
     The attribute is then overwritten with the html_url.
     """
     for element in linkList:
         raw_url = pq(element).attr(attr)
         if raw_url is None:
             # The element does not have this attribute; nothing to rewrite.
             continue

         request_url = urlTools.dealUrl2Request(Url(raw_url), origin_url)
         if request_url not in self.set:
             file_path, file_name, html_url = urlTools.dealUrl2File(
                 request_url, origin_url, self.host, True)
             # A newly queued resource starts with an error count of 0.
             self.queue_resource.push([request_url, file_path, file_name, 0])
             self.set[request_url] = [file_path, file_name, html_url]
         else:
             # Already known: reuse the values recorded the first time.
             file_path, file_name, html_url = self.set[request_url]

         pq(element).attr(attr, html_url)
Ejemplo n.º 2
0
 def dealALink(self, linkList, origin_url, attr, deep):
     """Rewrite page links and queue the pages they point to.

     For every element of ``linkList`` that carries ``attr``, the value is
     wrapped in ``Url`` and turned into a request URL relative to
     ``origin_url``. Unless ``outsite_page`` is truthy, only request URLs
     whose ``getHost()[1]`` equals ``self.host`` are handled; the others are
     left untouched. A request URL not yet in ``self.set`` is pushed onto
     ``self.queue_document`` with depth ``deep + 1`` and recorded in
     ``self.set``. The attribute is then overwritten with the html_url.
     """
     for anchor in linkList:
         href = pq(anchor).attr(attr)
         if href is None:
             continue

         request_url = urlTools.dealUrl2Request(Url(href), origin_url)
         if not (outsite_page or request_url.getHost()[1] == self.host):
             # Link to another host and off-site pages are not enabled.
             continue

         if request_url not in self.set:
             file_path, file_name, html_url = urlTools.dealUrl2File(
                 request_url, origin_url, self.host, True)
             # The trailing 0 is presumably the initial error count, as in
             # the resource queue entries -- TODO confirm against the consumer.
             self.queue_document.push(
                 [request_url, file_path, file_name, deep + 1, 0])
             self.set[request_url] = [file_path, file_name, html_url]
         else:
             file_path, file_name, html_url = self.set[request_url]

         pq(anchor).attr(attr, html_url)
Ejemplo n.º 3
0
 def dealSourceLink(self, linkList, origin_url, attr):
     """Replace each element's ``attr`` value with the recorded html_url.

     Elements of ``linkList`` without ``attr`` are skipped. For the others
     the attribute value is wrapped in ``Url`` and converted to a request URL
     using ``origin_url``. The first time a request URL is seen, the result
     of ``urlTools.dealUrl2File`` is stored in ``self.set`` and an entry is
     pushed onto ``self.queue_resource``; later sightings reuse the stored
     entry. Finally the attribute is set to the html_url.
     """
     for node in linkList:
         attr_value = pq(node).attr(attr)
         if attr_value is None:
             continue

         wrapped = Url(attr_value)
         request_url = urlTools.dealUrl2Request(wrapped, origin_url)

         known = request_url in self.set
         if known:
             file_path, file_name, html_url = self.set[request_url]
         else:
             file_path, file_name, html_url = urlTools.dealUrl2File(
                 request_url, origin_url, self.host, True)
             # Queue entry layout: url, path, name, error count (starts at 0).
             entry = [request_url, file_path, file_name, 0]
             self.queue_resource.push(entry)
             self.set[request_url] = [file_path, file_name, html_url]

         pq(node).attr(attr, html_url)
Ejemplo n.º 4
0
    def dealCss(self, text, origin_url):
        """Rewrite the ``url(...)`` references of a stylesheet.

        Every single-quoted, double-quoted or unquoted ``url(...)`` reference
        is converted to a request URL relative to ``origin_url``. A request
        URL not yet in ``self.set`` is pushed onto ``self.queue_resource``
        and recorded in ``self.set``. The reference is then replaced by the
        recorded html_url.

        Each occurrence is rewritten exactly once and in place. Replacing
        globally with ``str.replace`` would rewrite a repeated URL a second
        time whenever the new path contains the old one, and would corrupt
        other URLs that merely contain it as a substring.

        :param text: stylesheet source.
        :param origin_url: URL passed to ``urlTools`` to resolve references.
        :return: ``text`` with the handled references rewritten.
        """
        def localize(match):
            # Each alternative of the pattern has exactly one group, so
            # lastindex identifies the group that took part in this match.
            group = match.lastindex
            target = match.group(group)
            # Leave empty references and inline ``data:`` URIs untouched.
            # The colon matters: a relative path such as ``data/bg.png`` is
            # an ordinary resource that must still be handled.
            if target == '' or target.startswith('data:'):
                return match.group(0)

            request_url = urlTools.dealUrl2Request(target, origin_url)
            if request_url in self.set:
                file_path, file_name, html_url = self.set[request_url]
            else:
                file_path, file_name, html_url = urlTools.dealUrl2File(
                    request_url, origin_url, self.host, True)
                # A newly queued resource starts with an error count of 0.
                self.queue_resource.push(
                    [request_url, file_path, file_name, 0])
                self.set[request_url] = [file_path, file_name, html_url]

            # Splice the new URL over the captured group only, keeping the
            # surrounding ``url(``, quotes and ``)`` exactly as written.
            whole = match.group(0)
            offset = match.start()
            head = whole[:match.start(group) - offset]
            tail = whole[match.end(group) - offset:]
            return head + html_url.encode() + tail

        return re.sub(
            r'url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)', localize, text)
Ejemplo n.º 5
0
 def dealALink(self, linkList, origin_url, attr, deep):
     """Redirect page links to their html_url and enqueue unseen pages.

     Elements of ``linkList`` without ``attr`` are skipped. The attribute
     value is wrapped in ``Url`` and converted to a request URL using
     ``origin_url``. The link is handled only when ``outsite_page`` is truthy
     or ``request_url.getHost()[1]`` equals ``self.host``. Unseen request
     URLs are pushed onto ``self.queue_document`` at depth ``deep + 1`` and
     stored in ``self.set``; the attribute is then set to the html_url.
     """
     next_depth = deep + 1
     for link in linkList:
         target = pq(link).attr(attr)
         if target is None:
             continue

         request_url = urlTools.dealUrl2Request(Url(target), origin_url)

         allowed = outsite_page or request_url.getHost()[1] == self.host
         if not allowed:
             # Different host and off-site pages are disabled: keep as is.
             continue

         if request_url in self.set:
             file_path, file_name, html_url = self.set[request_url]
         else:
             file_path, file_name, html_url = urlTools.dealUrl2File(
                 request_url, origin_url, self.host, True)
             document_entry = [
                 request_url, file_path, file_name, next_depth, 0
             ]
             self.queue_document.push(document_entry)
             self.set[request_url] = [file_path, file_name, html_url]

         pq(link).attr(attr, html_url)
Ejemplo n.º 6
0
    def dealCss(self, text, origin_url):
        """Replace the ``url(...)`` references in CSS text with html_urls.

        The pattern recognises single-quoted, double-quoted and unquoted
        ``url(...)`` references. Each one is converted to a request URL
        using ``origin_url``. Request URLs not yet in ``self.set`` are
        pushed onto ``self.queue_resource`` and recorded in ``self.set``.

        The substitution is done per match with ``re.sub``, not with a global
        ``str.replace`` on the whole text. A global replace would rewrite a
        URL that appears several times more than once when the new path
        contains the old one, and would damage longer URLs that contain the
        shorter one as a substring.

        :param text: CSS source to process.
        :param origin_url: URL passed to ``urlTools`` to resolve references.
        :return: the CSS source with the handled references rewritten.
        """
        url_pattern = r'url\(\'(.*?)\'\)|url\(\"(.*?)\"\)|url\((.*?)\)'

        def rewrite(match):
            # Exactly one capture group participates per match.
            index = match.lastindex
            reference = match.group(index)
            if reference == '':
                return match.group(0)
            # Only real ``data:`` URIs are skipped; testing for the bare
            # prefix ``data`` would also skip paths like ``data/bg.png``.
            if reference.startswith('data:'):
                return match.group(0)

            request_url = urlTools.dealUrl2Request(reference, origin_url)
            if request_url in self.set:
                file_path, file_name, html_url = self.set[request_url]
            else:
                file_path, file_name, html_url = urlTools.dealUrl2File(
                    request_url, origin_url, self.host, True)
                error_count = 0
                self.queue_resource.push(
                    [request_url, file_path, file_name, error_count])
                self.set[request_url] = [
                    file_path, file_name, html_url
                ]

            # Rebuild this occurrence with only the captured reference
            # swapped, so quoting and the ``url(`` wrapper are preserved.
            matched_text = match.group(0)
            begin = match.start(index) - match.start()
            finish = match.end(index) - match.start()
            return (matched_text[:begin] + html_url.encode() +
                    matched_text[finish:])

        return re.sub(url_pattern, rewrite, text)
Ejemplo n.º 7
0
    def start(self, is_continue=False):
        if is_continue:
            self.queue_resource = Queue.load(logs_path + 'queue_resource.json')
            self.queue_document = Queue.load(logs_path + 'queue_document.json')
            self.set = UrlSet.load(logs_path + 'url_set.json')
        else:
            self.queue_resource = Queue(logs_path + 'queue_resource.json')
            self.queue_document = Queue(logs_path + 'queue_document.json')
            self.set = UrlSet(logs_path + 'url_set.json')

        self.document_task = Task(self.queue_document, doc_pool_max)
        self.document_task.initTaskWork(self.getDocument)

        self.source_task = Task(self.queue_resource, res_pool_max)
        self.source_task.initTaskWork(self.requestSource)

        self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url)
        print self.main_url, self.host
        file_path, file_name, html_url = urlTools.dealUrl2File(self.main_url, self.main_url, self.host, True)
        self.queue_document.push([self.main_url, file_path, file_name, 0, 0])
        print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url
        self.document_task.start()
Ejemplo n.º 8
0
    def start(self, is_continue=False):
        if is_continue:
            self.queue_resource = Queue.load(logs_path + 'queue_resource.json')
            self.queue_document = Queue.load(logs_path + 'queue_document.json')
            self.set = UrlSet.load(logs_path + 'url_set.json')
        else:
            self.queue_resource = Queue(logs_path + 'queue_resource.json')
            self.queue_document = Queue(logs_path + 'queue_document.json')
            self.set = UrlSet(logs_path + 'url_set.json')

        self.document_task = Task(self.queue_document, doc_pool_max)
        self.document_task.initTaskWork(self.getDocument)

        self.source_task = Task(self.queue_resource, res_pool_max)
        self.source_task.initTaskWork(self.requestSource)

        self.main_url = urlTools.dealUrl2Request(self.main_url, self.main_url)
        print self.main_url, self.host
        file_path, file_name, html_url = urlTools.dealUrl2File(
            self.main_url, self.main_url, self.host, True)
        self.queue_document.push([self.main_url, file_path, file_name, 0, 0])
        print "file_path:" + file_path, "file_name:" + file_name, "html_url:" + html_url
        self.document_task.start()