def crawl(self, args, browser):
     source = args['source']
     list_page_url = args['list_page_url']
     try:
         appstore_module = __import__(source)
     except ImportError:
         util.write_log_error(self.lock, self.logger, "cannot find " + source + ".py in configured appstores")
         return False
     appstore_class = appstore_module.Store()
     if self.url_rd.exists(util.get_MD5(list_page_url)):
         util.write_log_warn(self.lock, self.logger, "list_page_url: " + list_page_url + " was crawled in past %d seconds" % (config.CRAWLED_URL_REDIS_TIMEOUT))
         return False
     browser.desired_capabilities["phantomjs.page.settings.userAgent"] = appstore_class.PHANTOMJS_USER_AGENT
     util.browser_get_url(browser, list_page_url)
     # appstore_class.scroll_down(browser)
     check_more_count = appstore_class.click_checkmore(browser)
     util.write_log_warn(self.lock, self.logger, "list page: " + list_page_url + " no more 'checkmore' button found after " + str(check_more_count) + " clicks")
     detail_urls = appstore_class.get_detail_urls(browser)
     if len(detail_urls) == 0:
         util.write_log_warn(self.lock, self.logger, "list page: " + list_page_url + " found 0 apps")
     self.push_detail_page_message(args, detail_urls)
     next_list_page = appstore_class.click_nextpage(browser)
     if next_list_page:
         self.push_list_page_message(args, next_list_page)
     util.write_log_info(self.lock, self.logger, "SUCCESS: list_page_url: %s crawled" % (list_page_url))
     md5 = util.get_MD5(list_page_url)
     self.url_rd.set(md5, 1)
     self.url_rd.expire(md5, config.CRAWLED_URL_REDIS_TIMEOUT)
     return True
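
The list-page crawler's dedup boils down to an MD5-keyed Redis flag with a TTL. A minimal standalone sketch of the same pattern, assuming redis-py; the timeout value and the helpers below are illustrative stand-ins for util.get_MD5 and config.CRAWLED_URL_REDIS_TIMEOUT, not names from this project:

import hashlib

import redis

CRAWLED_URL_REDIS_TIMEOUT = 24 * 3600  # assumed value, stands in for config.CRAWLED_URL_REDIS_TIMEOUT

def url_md5(url):
    # Stand-in for util.get_MD5: hex digest of the UTF-8 encoded URL.
    return hashlib.md5(url.encode('utf-8')).hexdigest()

def seen_recently(rd, url):
    # A URL counts as "seen" while its MD5 key is still alive in Redis.
    return rd.exists(url_md5(url))

def mark_seen(rd, url):
    # SET followed by EXPIRE, as in the crawler; SETEX does both in one call.
    rd.setex(url_md5(url), CRAWLED_URL_REDIS_TIMEOUT, 1)

rd = redis.Redis()
url = 'http://example.com/list?page=1'
if not seen_recently(rd, url):
    mark_seen(rd, url)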
Example #2
 def put_download_url_into_redis(self, download_url, response):
     md5 = util.get_MD5(download_url)
     if "last-modified" in response.headers:
         self.url_rd.set(md5, response.headers["last-modified"])
     else:
         self.url_rd.set(md5, 1)
         self.url_rd.expire(md5, config.CRAWLED_URL_REDIS_TIMEOUT)
     return
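
Note the asymmetry in this helper: when the server supplies a Last-Modified header, the key is stored with no TTL so it survives for later revalidation in is_url_need_to_download; only the fallback value of 1 expires after config.CRAWLED_URL_REDIS_TIMEOUT.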
Example #3
 def is_url_need_to_download(self, download_url, response, target_folder,
                             download_file_type):
     md5 = util.get_MD5(download_url)
     if 'content-md5' in response.headers:
         apk_md5 = response.headers['content-md5']
         if os.path.exists(target_folder + 'cell-' + apk_md5[:2] + '/' +
                           apk_md5 +
                           config.FILE_TYPE[download_file_type]['extension_filename']):
             return 'file with same md5 exists'
     if not self.url_rd.exists(md5):
         return 'yes'
     if 'last-modified' in response.headers:
         if response.headers['last-modified'] != self.url_rd.get(md5):
             return 'yes'
         else:
             return 'not modified'
     else:
         return 'was crawled in past %d seconds' % (
             config.CRAWLED_URL_REDIS_TIMEOUT)
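
A hypothetical caller for the two helpers above, assuming the requests library; the 'apk' file-type key and the /data/apks/ folder are made-up placeholders for whatever config.FILE_TYPE actually defines:

import requests

def maybe_download(crawler, download_url, target_folder='/data/apks/'):
    # Probe the URL cheaply first; is_url_need_to_download inspects the
    # headers (content-md5, last-modified) and the Redis dedup state.
    response = requests.head(download_url, allow_redirects=True, timeout=30)
    verdict = crawler.is_url_need_to_download(
        download_url, response, target_folder, 'apk')
    if verdict != 'yes':
        return None  # already on disk, not modified, or still flagged in Redis
    body = requests.get(download_url, timeout=300)
    crawler.put_download_url_into_redis(download_url, response)
    return body.content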
Example #4
 def crawl(self, args, browser):
     source = args['source']
     detail_page_url = args['detail_page_url']
     try:
         appstore_module = __import__(source)
     except ImportError:
         util.write_log_error(self.lock, self.logger, "cannot find " + source + ".py in configured appstores")
         return False
     appstore_class = appstore_module.Store()
     if self.url_rd.exists(util.get_MD5(detail_page_url)):
         util.write_log_warn(self.lock, self.logger, "detail_page_url: " + detail_page_url + " was crawled in past %d seconds" % (config.CRAWLED_URL_REDIS_TIMEOUT))
         return False
     browser.desired_capabilities["phantomjs.page.settings.userAgent"] = appstore_class.PHANTOMJS_USER_AGENT
     util.browser_get_url(browser, detail_page_url)
     download_url = appstore_class.get_download_url(browser)
     if download_url is None:
         util.write_log_error(self.lock, self.logger, "detail page: " + detail_page_url + " no download_url found")
         return False
     out_json_message = appstore_class.make_detail2download_json_message(browser)
     out_json_message['detail_url'] = detail_page_url
     out_json_message['download_url'] = download_url
     if args['level'] == 'high':
         out_json_message['next_queue'] = config.HIGH_DOWNLOAD_QUEUE
     else:
         out_json_message['next_queue'] = config.LOW_DOWNLOAD_QUEUE
     out_json_message['level'] = args['level']
     out_json_message['message_type'] = args['message_type']
     util.write_log_info(self.lock, self.logger, "SUCCESS: detail_page_url: %s crawled" % (detail_page_url))
     md5 = util.get_MD5(detail_page_url)
     self.url_rd.set(md5, 1)
     self.url_rd.expire(md5, config.CRAWLED_URL_REDIS_TIMEOUT)
     return self.push_message(out_json_message)
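
Routing note: the outgoing message carries its own destination, so high-priority detail pages land on config.HIGH_DOWNLOAD_QUEUE and everything else on config.LOW_DOWNLOAD_QUEUE, presumably letting downstream download workers drain the high-priority queue first.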
Example #5

    def normal(self, args, browser):
        util.write_log_warn(self.lock, self.logger, '"normal" message type did nothing.')
        return True

    def do_work(self, message, browser):
        util.random_sleep()
        args = self.prepare_args(message)
        if args is None:
            return
        # Dispatch to the method named by the message type (e.g. self.crawl);
        # getattr is the idiomatic, safer equivalent of the original eval.
        return getattr(self, args['message_type'])(args, browser)
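
String-driven dispatch like this executes whatever method name the message supplies, so it is worth validating the type first. A minimal hardening sketch, with a hypothetical whitelist of handler names:

ALLOWED_MESSAGE_TYPES = {'crawl', 'normal'}  # hypothetical whitelist

def dispatch(worker, args, browser):
    # Reject unknown message types before touching worker attributes.
    message_type = args['message_type']
    if message_type not in ALLOWED_MESSAGE_TYPES:
        raise ValueError('unknown message_type: %r' % message_type)
    return getattr(worker, message_type)(args, browser)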

    def work_process(self, browser):