def crawl(self, args, browser):
    """Crawl one app-store list page.

    Imports the store plugin named by args['source'], skips the page if its
    MD5 key is still in redis, otherwise loads it in the browser, expands
    "check more", pushes detail-page and next-list-page messages, and marks
    the URL as crawled in redis with a TTL.

    Returns True on success, False when the plugin is missing or the page
    was crawled recently.
    """
    source = args['source']
    list_page_url = args['list_page_url']

    # Plugin lookup: each store ships a <source>.py module exposing Store().
    try:
        store_module = __import__(source)
    except ImportError:
        util.write_log_error(self.lock, self.logger, "can not find " + source + ".py in configured appstores")
        return False
    store = store_module.Store()

    # Redis-based dedup: the MD5 of the URL doubles as the "recently crawled" key.
    page_key = util.get_MD5(list_page_url)
    if self.url_rd.exists(page_key):
        util.write_log_warn(self.lock, self.logger, "list_page_url:" + list_page_url + "was crawled in past %d seconds"%(config.CRAWLED_URL_REDIS_TIMEOUT))
        return False

    # Spoof the store-specific user agent before fetching.
    browser.desired_capabilities["phantomjs.page.settings.userAgent"] = store.PHANTOMJS_USER_AGENT
    util.browser_get_url(browser, list_page_url)
    # store.scroll_down(browser)

    # Keep clicking "check more" until it disappears, then log how often it fired.
    check_more_count = store.click_checkmore(browser)
    util.write_log_warn(self.lock, self.logger, "列表页: " + list_page_url + "点击checkmore" + str(check_more_count) + "次后找不到checkmore")

    detail_urls = store.get_detail_urls(browser)
    if not detail_urls:
        util.write_log_warn(self.lock, self.logger, "列表页: " + list_page_url + " 找到0个app")
    self.push_detail_page_message(args, detail_urls)

    # Follow pagination if the store reports another list page.
    next_list_page = store.click_nextpage(browser)
    if next_list_page:
        self.push_list_page_message(args, next_list_page)

    util.write_log_info(self.lock, self.logger,"SUCCESS : list_page_url: %s crawled"%(list_page_url))
    # Mark crawled; the TTL makes the page eligible again after the timeout.
    self.url_rd.set(page_key, 1)
    self.url_rd.expire(page_key, config.CRAWLED_URL_REDIS_TIMEOUT)
    return True
def put_download_url_into_redis(self, download_url, response):
    """Record a downloaded URL in redis, keyed by its MD5.

    The stored value is the response's last-modified header when present
    (so later crawls can detect changes), otherwise the sentinel 1.
    The key expires after config.CRAWLED_URL_REDIS_TIMEOUT seconds.
    """
    key = util.get_MD5(download_url)
    value = response.headers["last-modified"] if "last-modified" in response.headers else 1
    self.url_rd.set(key, value)
    self.url_rd.expire(key, config.CRAWLED_URL_REDIS_TIMEOUT)
    return
def put_download_url_into_redis(self, download_url, response):
    """Record a downloaded URL in redis, keyed by its MD5.

    Stores the response's last-modified header when available (used later
    for change detection), otherwise the sentinel 1, and sets the TTL to
    config.CRAWLED_URL_REDIS_TIMEOUT seconds.
    """
    key = util.get_MD5(download_url)
    value = response.headers['last-modified'] if 'last-modified' in response.headers else 1
    self.url_rd.set(key, value)
    self.url_rd.expire(key, config.CRAWLED_URL_REDIS_TIMEOUT)
    return
def is_url_need_to_download(self, download_url, response, target_folder, download_file_type):
    """Decide whether *download_url* should be downloaded.

    Checks, in order:
    1. If the response carries a content-md5 header and a file with that
       md5 already exists under target_folder, skip it.
    2. If the URL's MD5 is not in redis, it has never been crawled: download.
    3. If redis holds a last-modified value, download only when it differs
       from the response's last-modified header.
    4. Otherwise the URL was crawled recently: skip.

    Returns a short status string; 'yes' means download is needed.
    """
    md5 = util.get_MD5(download_url)
    if 'content-md5' in response.headers:
        apk_md5 = response.headers['content-md5']
        # Files are sharded into cell-<first two hex chars>/ directories.
        target_path = (target_folder + 'cell-' + apk_md5[:2] + '/' + apk_md5
                       + config.FILE_TYPE[download_file_type]['extension_filename'])
        # BUG FIX: os.exists does not exist (AttributeError at runtime);
        # os.path.exists is the correct call.
        if os.path.exists(target_path):
            return 'file with same md5 exists'
    if not self.url_rd.exists(md5):
        return 'yes'
    if 'last-modified' in response.headers:
        if response.headers['last-modified'] != self.url_rd.get(md5):
            return 'yes'
        else:
            return 'not modified'
    else:
        return ' has crawled in past %d seconds' % (config.CRAWLED_URL_REDIS_TIMEOUT)
def is_url_need_to_download(self, download_url, response, target_folder, download_file_type):
    """Decide whether *download_url* should be downloaded.

    Checks, in order:
    1. If the response carries a content-md5 header and a file with that
       md5 already exists under target_folder, skip it.
    2. If the URL's MD5 is not in redis, it has never been crawled: download.
    3. If redis holds a last-modified value, download only when it differs
       from the response's last-modified header.
    4. Otherwise the URL was crawled recently: skip.

    Returns a short status string; 'yes' means download is needed.
    """
    md5 = util.get_MD5(download_url)
    if "content-md5" in response.headers:
        apk_md5 = response.headers["content-md5"]
        # Files are sharded into cell-<first two hex chars>/ directories.
        target_path = (
            target_folder + "cell-" + apk_md5[:2] + "/" + apk_md5
            + config.FILE_TYPE[download_file_type]["extension_filename"]
        )
        # BUG FIX: os.exists does not exist (AttributeError at runtime);
        # os.path.exists is the correct call.
        if os.path.exists(target_path):
            return "file with same md5 exists"
    if not self.url_rd.exists(md5):
        return "yes"
    if "last-modified" in response.headers:
        if response.headers["last-modified"] != self.url_rd.get(md5):
            return "yes"
        else:
            return "not modified"
    else:
        return " has crawled in past %d seconds" % (config.CRAWLED_URL_REDIS_TIMEOUT)
def crawl(self, args, browser): source = args['source'] detail_page_url = args['detail_page_url'] try: appstore_modual = __import__(source) except ImportError: util.write_log_error(self.lock, self.logger, "can not find " + source + ".py in configured appstores") return False appstore_class = appstore_modual.Store() if self.url_rd.exists(util.get_MD5(detail_page_url)): util.write_log_warn(self.lock, self.logger, "detail_page_url:" + detail_page_url + "was crawled in past %d seconds"%(config.CRAWLED_URL_REDIS_TIMEOUT)) return False browser.desired_capabilities["phantomjs.page.settings.userAgent"] = appstore_class.PHANTOMJS_USER_AGENT util.browser_get_url(browser, detail_page_url) download_url = appstore_class.get_download_url(browser) if download_url is None: util.write_log_error(self.lock, self.logger, "详情页: " + detail_page_url + " 找不到download_url") return False out_json_message = appstore_class.make_detail2download_json_message(browser) out_json_message['detail_url'] = detail_page_url out_json_message['download_url'] = download_url if args['level'] = 'high': out_json_message['next_queue'] = config.HIGH_DOWNLOAD_QUEUE
util.browser_get_url(browser, detail_page_url) download_url = appstore_class.get_download_url(browser) if download_url is None: util.write_log_error(self.lock, self.logger, "详情页: " + detail_page_url + " 找不到download_url") return False out_json_message = appstore_class.make_detail2download_json_message(browser) out_json_message['detail_url'] = detail_page_url out_json_message['download_url'] = download_url if args['level'] = 'high': out_json_message['next_queue'] = config.HIGH_DOWNLOAD_QUEUE else: out_json_message['next_queue'] = config.LOW_DOWNLOAD_QUEUE out_json_message['level'] = args['level'] out_json_message['message_type'] = args['message_type'] util.write_log_info(self.lock, self.logger,"SUCCESS : detail_page_url: %s crawled"%(detail_page_url)) self.url_rd.set(util.get_MD5(detail_page_url),1) self.url_rd.expire(util.get_MD5(detail_page_url), config.CRAWLED_URL_REDIS_TIMEOUT) return self.push_message(out_json_message) def normal(self, args, browser): util.write_log_warn(self.lock, self.logger, '"normal" message type did nothing.') return True def do_work(self, message, browser): util.random_sleep() args = self.prepare_args(message) if args is None: return return eval('self.'+args['message_type'])(args, browser) def work_process(self, browser):