Example #1
    def consume(self, process_id):
        while True:
            self.logger.log("Consumer process:" + str(process_id) +
                            " fetch new image from queue")
            if not self.queue.empty():
                # Note: empty() followed by get() is racy when several
                # consumers share the queue; a blocking get() with a
                # timeout would be safer.
                image_id, link = self.queue.get()
                self.logger.log("Consumer process:" + str(process_id) +
                                " start crawling " + str(link))
                image = common_utils.page_crawl(link)
                if image is not None:
                    self.logger.log(link + " crawled successfully")
                    self.adapter.store_image(image_id, image)
                else:
                    self.logger.log(link + " failed at crawling")
                    self.adapter.update_image_status(
                        image_id, ImageIndexStatus.DOWNLOAD_FAILED)
                self.queue.task_done()
                time.sleep(1)  # throttle between downloads
            else:
                self.logger.log("Queue empty")
                time.sleep(10)  # back off while the queue is empty
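
For context, consume() above assumes a task queue shared by several worker processes. A minimal, self-contained sketch of how such a consumer might be wired up with multiprocessing follows; the queue contents, worker count, and print-based logging are illustrative stand-ins for the adapter and logger that the original class provides, not part of the original code:

    import multiprocessing
    import queue


    def consume(task_queue, process_id):
        # Drain (image_id, link) pairs until the queue stays empty.
        while True:
            try:
                # A blocking get with a timeout avoids the empty()/get()
                # race between competing consumer processes.
                image_id, link = task_queue.get(timeout=5)
            except queue.Empty:
                print("process %d: queue empty, exiting" % process_id)
                return
            print("process %d: crawling %s (id=%s)" % (process_id, link, image_id))
            task_queue.task_done()


    if __name__ == "__main__":
        q = multiprocessing.JoinableQueue()
        for i, link in enumerate(["http://example.com/a.jpg",
                                  "http://example.com/b.jpg"]):
            q.put((i, link))
        workers = [multiprocessing.Process(target=consume, args=(q, pid))
                   for pid in range(2)]
        for w in workers:
            w.start()
        q.join()  # returns once every task_done() has been called
        for w in workers:
            w.join()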
Example #2
    def run(self):
        while True:
            count = 0
            try:
                for url_hash, url in self.adapter.load_uncrawled_docs(
                        BatchCrawler.MAX_DOCS_NUM):
                    count += 1
                    self.logger.log("crawling url %s" % url, 2)
                    page = common_utils.page_crawl(url)
                    if page is None:
                        self.adapter.update_doc_raw_as_crawled_failed(url_hash)
                        continue
                    # Re-encode the page to UTF-8 before storing it.
                    if self.encode != "utf-8":
                        page = page.decode(self.encode).encode("utf-8")

                    self.adapter.update_doc_raw_with_crawled_page(
                        url_hash, "utf-8", page)
                    time.sleep(float(self.request_interval))
                # A batch shorter than MAX_DOCS_NUM means no uncrawled
                # docs remain, so stop.
                if count < BatchCrawler.MAX_DOCS_NUM:
                    break
            except Exception as e:
                # A bare except also swallows KeyboardInterrupt and hides
                # the failure; catch Exception and log the details instead.
                self.logger.log("mongo error: %s" % e)