def process_item(self, item, spider):
    if item.__class__ == AppIdentificationItem:
        obj, created = AppIdentification.objects.get_or_create(
            apk_name=item['apk_name'])
        if 'top_type' in item and (item['top_type'] != obj.top_type):
            obj.top_type = item['top_type']
            obj.save()
        if 'category' in item:
            cat, is_created = Category.objects.get_or_create(
                name=item['category'], top_type=item['top_type'])
        if created:
            appinfo = AppInfo(app_id=obj, data_source=item['data_source'])
            appinfo.save()
            log.msg('Get new apk %s' % obj.apk_name, level=log.INFO)
            return item
        else:
            spider.log('Duplicate apk %s' % obj.apk_name, level=log.INFO)
            return

    if item.__class__ == AppInfoItem:
        app = item['instance']
        # basic information
        for key in APK_DETAILS_FILED_NAMES:
            setattr(app, key, item[key])
        app.is_crawled = 1
        app.last_crawl_time = datetime.now()
        app.save()
        # related information
        update_app_related(app, item)
        spider.log('update ok %s' % item['apk_name'], level=log.INFO)
        # sync data to Doraemon
        url = "%s/?apk_name=%s&force=%s" % (
            self.crawler.settings['DATA_SYNC_API'],
            app.app_id.apk_name,
            self.crawler.settings.get('FORCE_UPDATE'))
        # return the deferred so the sync to Doraemon finishes before the item is passed on
        request = Request(url=url)
        request.callback = None
        request.errback = None
        dfd = self.crawler.engine.download(request, spider)
        dfd.addCallbacks(callback=self._sync_callback,
                         callbackArgs=(item['apk_name'], spider),
                         errback=self._sync_errback,
                         errbackArgs=(item['apk_name'], spider))
        dfd.addErrback(spider.log, level=log.ERROR)
        return dfd.addBoth(lambda _: item)
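# The deferred chain above assumes the pipeline also defines _sync_callback and
# _sync_errback helpers; their bodies are not part of this snippet. A minimal
# sketch of what they might look like (the method names and argument order come
# from the addCallbacks call above, the bodies are assumptions):

def _sync_callback(self, response, apk_name, spider):
    # Log the sync API's response status for this apk.
    spider.log('sync ok %s (status %s)' % (apk_name, response.status),
               level=log.INFO)

def _sync_errback(self, failure, apk_name, spider):
    # Log the failure; the trailing addBoth still returns the item afterwards.
    spider.log('sync failed %s: %s' % (apk_name, failure.getErrorMessage()),
               level=log.ERROR)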
def next_request(self):
    '''
    Logic to handle getting a new url request, from a bunch of
    different queues
    '''
    t = time.time()
    # update the redis queues every so often
    if t - self.update_time > self.update_interval:
        self.update_time = t
        self.create_queues()
        self.expire_queues()

    # update the ip address every so often
    if t - self.update_ip_time > self.ip_update_interval:
        self.update_ip_time = t
        self.update_ipaddress()
        self.report_self()

    item = self.find_item()
    if item:
        self.logger.debug("Found url to crawl {url}"
                          .format(url=item['url']))
        try:
            req = Request(item['url'])
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request('http://' + item['url'])

        try:
            if 'callback' in item and item['callback'] is not None:
                req.callback = getattr(self.spider, item['callback'])
        except AttributeError:
            self.logger.warn("Unable to find callback method")

        try:
            if 'errback' in item and item['errback'] is not None:
                req.errback = getattr(self.spider, item['errback'])
        except AttributeError:
            self.logger.warn("Unable to find errback method")

        if 'meta' in item:
            item = item['meta']

        # defaults not in schema
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if "retry_times" not in item:
            item['retry_times'] = 0

        for key in list(item.keys()):
            req.meta[key] = item[key]

        # extra check to add items to request
        if 'useragent' in item and item['useragent'] is not None:
            req.headers['User-Agent'] = item['useragent']
        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            elif isinstance(item['cookie'], basestring):
                req.cookies = self.parse_cookie(item['cookie'])

        return req

    return None
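# next_request calls self.parse_cookie() when the queue item carries its cookie
# as a raw header-style string rather than a dict. That helper is not part of
# this snippet; a minimal sketch, assuming it only needs to split "k1=v1; k2=v2"
# pairs into a dict, could look like:

def parse_cookie(self, string):
    '''Turn a raw "key=value; key2=value2" cookie string into a dict.'''
    result = {}
    for pair in string.split(';'):
        pair = pair.strip()
        if not pair or '=' not in pair:
            continue
        key, value = pair.split('=', 1)
        result[key.strip()] = value.strip()
    return result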
def next_request(self):
    '''
    Logic to handle getting a new url request, from a bunch of
    different queues
    '''
    t = time.time()
    # update the redis queues every so often
    if t - self.update_time > self.update_interval:
        self.update_time = t
        self.create_queues()

    item = self.find_item()
    if item:
        self.logger.info(
            'distributed_scheduler.py::DistributedScheduler::next_request '
            'call find_item() result is : %s' % (
                item["meta"]["url"] if 'meta' in item else item["url"]))
        self.logger.debug("Found url to crawl {url}"
                          .format(url=item['url']))
        try:
            req = Request(item['url'])
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request('http://' + item['url'])

        if 'callback' in item:
            cb = item['callback']
            if cb and self.spider:
                cb = get_method(self.spider, cb)
                req.callback = cb

        if 'errback' in item:
            eb = item['errback']
            if eb and self.spider:
                eb = get_method(self.spider, eb)
                req.errback = eb

        if 'meta' in item:
            item = item['meta']

        # defaults not in schema
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if "retry_times" not in item:
            item['retry_times'] = 0

        for key in item.keys():
            req.meta[key] = item[key]

        # extra check to add items to request
        if 'useragent' in item and item['useragent'] is not None:
            req.headers['User-Agent'] = item['useragent']
        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            elif isinstance(item['cookie'], basestring):
                req.cookies = self.parse_cookie(item['cookie'])

        return req

    return None
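# For reference, the queue item that find_item() returns is expected to look
# roughly like the dict below. The field names are taken from the checks in
# next_request above; the concrete values are illustrative only.

example_item = {
    'url': 'http://example.com/page',
    'callback': 'parse',            # spider method name, resolved via get_method()
    'errback': None,                # optional spider method name
    'useragent': 'Mozilla/5.0',     # copied into the User-Agent header
    'cookie': 'sessionid=abc123; lang=en',  # dict or raw string (see parse_cookie)
    # 'meta': {...}  # if present, its contents replace the item before the
    #                # curdepth / retry_times defaults and header checks run
}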