def parse_api(self, response):
    meta = response.request.meta
    source = meta['domain']
    all_link = []
    url = response.request.url
    json_data = json.loads(response.body)
    for v in json_data['info']['value']:
        aid = v['appid']
        link = "http://android.myapp.com/android/appdetail.jsp?appid=%s" % aid
        if link not in all_link:
            # only fan out to the related-apps API when this response did not
            # itself come from a 'qryrelate' call, to avoid recursing forever
            if url.find('qryrelate') == -1:
                related_api = ('http://android.myapp.com/android/qryrelatesoft_web'
                               '?appid=%s&icontype=60&pageNo=1&pageSize=50' % aid)
                all_link.append(related_api)
            all_link.append(link)
    if all_link:
        if self.name.startswith('update.'):
            service.report_update_status([LinkStatus(link, source, Status.FOUND, LinkType.CATELOG)
                                          for link in all_link])
            service.report_update_status([LinkStatus(url, source, Status.SUCCEED,
                                                     LinkType.CATELOG, len(all_link))])
        else:
            service.report_status([LinkStatus(link, source, Status.FOUND, LinkType.CATELOG)
                                   for link in all_link])
            service.report_status([LinkStatus(url, source, Status.SUCCEED,
                                              LinkType.CATELOG, len(all_link))])

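# A hedged sketch (invented payload, not captured traffic) of the JSON shape
# parse_api assumes from the myapp.com API, and the two kinds of links it
# derives per appid. The helper name and appid value are illustrative only.
def _example_myapp_links():
    payload = {"info": {"value": [{"appid": 52688}]}}
    aid = payload['info']['value'][0]['appid']
    detail = "http://android.myapp.com/android/appdetail.jsp?appid=%s" % aid
    related = ("http://android.myapp.com/android/qryrelatesoft_web"
               "?appid=%s&icontype=60&pageNo=1&pageSize=50" % aid)
    return [related, detail]
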
def process_spider_input(self, response, spider):
    # common case
    if 200 <= response.status < 300:
        return
    if 'handle_httpstatus_list' in response.request.meta:
        allowed_statuses = response.request.meta['handle_httpstatus_list']
    else:
        allowed_statuses = getattr(spider, 'handle_httpstatus_list', ())
    if response.status in allowed_statuses:
        return
    source = response.request.meta['domain']
    # report to server
    if spider.name.startswith('update.'):
        service.report_update_status([LinkStatus(response.request.url, source,
                                                 Status.FAIL, LinkType.UNKNOWN)])
    else:
        service.report_status([LinkStatus(response.request.url, source,
                                          Status.FAIL, LinkType.UNKNOWN)])
    # report to monitor
    market.report_link(source, 'http_error', response.request.url, response.status)

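# Usage sketch (assumes the standard Scrapy handle_httpstatus_list convention):
# a spider can whitelist non-2xx statuses class-wide or per-request, which is
# exactly what process_spider_input consults before reporting a failure.
# ExampleSpider and the URL below are hypothetical.
from scrapy.http import Request

class ExampleSpider(object):
    name = 'example.com'
    handle_httpstatus_list = [404]  # class-wide: let 404 responses reach callbacks

def _example_request():
    # the per-request list in meta takes precedence over the spider attribute
    return Request('http://example.com/app/1',
                   meta={'handle_httpstatus_list': [302, 404],
                         'domain': 'example.com'})
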
def _redirect(self, redirected, request, spider, reason):
    ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
    redirects = request.meta.get('redirect_times', 0) + 1
    if ttl and redirects <= self.max_redirect_times:
        redirected.meta['redirect_times'] = redirects
        redirected.meta['redirect_ttl'] = ttl - 1
        redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
            [request.url]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request),
                spider=spider, level=log.DEBUG)
        if spider.redirectprocessor_class:
            processor = spider.redirectprocessor_class()
            processor.process(redirected, request, spider, reason)
        # report redirect status, usually this link is not valid
        elif reason in [301, 302, 307]:
            source = request.meta['domain']
            if spider.name.startswith('update.'):
                service.report_update_status([LinkStatus(request.url, source,
                                                         Status.REDIRECT, LinkType.UNKNOWN)])
            else:
                service.report_status([LinkStatus(request.url, source,
                                                  Status.REDIRECT, LinkType.UNKNOWN)])
        return redirected
    else:
        log.msg("Discarding %s: max redirections reached" % request,
                spider=spider, level=log.DEBUG)
        raise IgnoreRequest

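# Worked trace (hypothetical URLs) of the meta bookkeeping in _redirect above,
# assuming max_redirect_times = 20, for a two-hop chain a -> b -> c:
#
#   request a: meta = {'redirect_ttl': 20}            (set by setdefault)
#   request b: meta = {'redirect_times': 1, 'redirect_ttl': 19,
#                      'redirect_urls': ['http://a/']}
#   request c: meta = {'redirect_times': 2, 'redirect_ttl': 18,
#                      'redirect_urls': ['http://a/', 'http://b/']}
#
# The chain is dropped (IgnoreRequest) when the ttl is exhausted or the hop
# count exceeds max_redirect_times.
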
def parse_item(self, response):
    meta = response.request.meta
    source = meta['domain']
    log_info('parse_item_1===========')
    #source = 'appchina.com'
    url = response.request.url
    if self.sourcelinkprocessor_class:
        processor = self.sourcelinkprocessor_class()
        url = processor.process(url)
    if not self._process_response(response, source, LinkType.LEAF):
        # report the original (pre-redirect) URL as a failed leaf; fall back
        # to the request URL when the request was never redirected
        service.report_status([LinkStatus(meta.get('redirect_urls', [url])[0],
                                          source, Status.FAIL, LinkType.LEAF)])
        market.remove_app(url, source)
        log_info('parse_item_2===========')
        return
    if not self.name.startswith('update.') and self.name != 'itunes.apple.com':
        self.parse(response)
    if source.endswith('hiapk.com'):
        body = response.body.replace('</br>', '<p>')
        response = response.replace(body=body)
    if not self.itemloader_class:
        log_info('parse_item_3===========')
        return
    try:
        selector = HtmlXPathSelector(response)
        try:
            loader = self.itemloader_class(selector, response=response)
        except TypeError:
            # older loader classes do not accept the response keyword
            loader = self.itemloader_class(selector)
        # log_info("loader=====%s" % type(loader))
        loader.add_value('source', source)
        loader.add_value('source_link', url)
    except Exception as e:
        log_info('parse_item_4===========\n%s' % e)
        log_error(e)
        if self.name.startswith('update.'):
            service.report_update_status([LinkStatus(url, source, Status.FAIL,
                                                     LinkType.UNKNOWN)])
        else:
            service.report_status([LinkStatus(url, source, Status.FAIL,
                                              LinkType.UNKNOWN)])
        # the loader is unusable here; do not fall through to load_item()
        return
    log_info('parse_item_5===========')
    try:
        item = loader.load_item()
        if self.is_item_valid(item):
            return item
        else:
            market.remove_app(url, source)
    except Exception as e:
        log_error(e)

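# parse_item defers to is_item_valid before returning an item; the concrete
# checks are not shown in this section, so the sketch below is an assumption
# about the kind of gate implied. The field names 'title' and 'download_link'
# are hypothetical, not the project's actual item schema.
def is_item_valid(self, item):
    # treat an app item missing its essential fields as a failed parse
    return bool(item.get('title')) and bool(item.get('download_link'))
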
def process(self, redirected, request, spider, reason):
    # spider.name doubles as the source domain; escape it so the dots in the
    # domain are not treated as regex wildcards
    pattern = re.compile('^http://%s' % re.escape(spider.name))
    redirected_match = pattern.match(redirected.url)
    request_match = pattern.match(request.url)
    # a redirect that leaves the spider's own domain usually means the
    # original leaf link is dead, so report it as failed
    if redirected_match is None and request_match:
        request_url = request.url
        if spider.sourcelinkprocessor_class:
            processor = spider.sourcelinkprocessor_class()
            request_url = processor.process(request_url)
        # report the failure even when no sourcelinkprocessor is configured
        service.report_status([LinkStatus(request_url, spider.name,
                                          Status.FAIL, LinkType.LEAF)])

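# Wiring sketch (assumed convention): _redirect looks up
# spider.redirectprocessor_class, instantiates it, and calls process(), so a
# spider opts in by pointing that attribute at the class defining process()
# above. 'OffsiteRedirectProcessor' and 'WiredSpider' are hypothetical names.
class OffsiteRedirectProcessor(object):
    def process(self, redirected, request, spider, reason):
        pass  # see the process() implementation above

class WiredSpider(object):
    name = 'example.com'
    redirectprocessor_class = OffsiteRedirectProcessor
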
def _process_response(self, response, source, type):
    '''
    Returns True if the response can be processed further, otherwise False.
    '''
    url = response.request.url
    if self.sourcefilterprocessor_class:
        processor = self.sourcefilterprocessor_class()
        url = processor.process(self, url)
        if url is None:
            # the filter rejected this link; report the original URL as failed
            service.report_status([LinkStatus(response.request.url, source,
                                              Status.FAIL, type)])
            return False
    if self.sourcelinkprocessor_class:
        processor = self.sourcelinkprocessor_class()
        url = processor.process(url)
        if url is None:
            service.report_status([LinkStatus(response.request.url, source,
                                              Status.FAIL, type)])
            return False
    if response.status == 200:
        if self.name.startswith('update.'):
            service.report_update_status([LinkStatus(url, source, Status.SUCCEED, type)])
        else:
            service.report_status([LinkStatus(url, source, Status.SUCCEED, type)])
        return True
    else:
        if self.name.startswith('update.'):
            # FAIL, not SUCCEED: a non-200 response must not be reported as success
            service.report_update_status([LinkStatus(url, source, Status.FAIL, type)])
        else:
            service.report_status([LinkStatus(url, source, Status.FAIL, type)])
        return False

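# The reporting calls throughout this module assume a small status vocabulary.
# A minimal sketch of the assumed interface (the real definitions live
# elsewhere in the project; the values and the 'count' parameter name are
# placeholders inferred from the call sites):
class Status(object):
    FOUND, SUCCEED, FAIL, REDIRECT = range(4)

class LinkType(object):
    UNKNOWN, CATELOG, LEAF = range(3)  # 'CATELOG' spelling follows the codebase

class LinkStatus(object):
    def __init__(self, url, source, status, link_type, count=None):
        self.url, self.source = url, source
        self.status, self.link_type, self.count = status, link_type, count
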
def parse_zhushou_cate(self, response):
    meta = response.request.meta
    source = meta['domain']
    url = response.request.url
    import simplejson
    data = simplejson.loads(response.body)
    pns = jsonutils.find_attr(data, 'packageName', str)
    urls = ['http://www.wandoujia.com/apps/%s' % pn for pn in pns]
    if urls:
        if self.name.startswith('update.'):
            service.report_update_status([LinkStatus(u, source, Status.FOUND, LinkType.LEAF)
                                          for u in urls])
            service.report_update_status([LinkStatus(url, source, Status.SUCCEED,
                                                     LinkType.CATELOG, len(urls))])
        else:
            service.report_status([LinkStatus(u, source, Status.FOUND, LinkType.LEAF)
                                   for u in urls])
            service.report_status([LinkStatus(url, source, Status.SUCCEED,
                                              LinkType.CATELOG, len(urls))])

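# jsonutils.find_attr is assumed to walk the decoded JSON recursively and
# collect every value stored under the given key whose type matches. A minimal
# sketch of that contract (not the project's actual implementation):
def find_attr(node, key, types):
    found = []
    if isinstance(node, dict):
        for k, v in node.items():
            if k == key and isinstance(v, types):
                found.append(v)
            found.extend(find_attr(v, key, types))
    elif isinstance(node, list):
        for v in node:
            found.extend(find_attr(v, key, types))
    return found
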
def parse(self, response):
    meta = response.request.meta
    source = meta['domain']
    all_link = []
    url = response.request.url
    if not self._process_response(response, source, LinkType.CATELOG):
        return
    rule_dicts = meta['rules']
    rules = self._get_rules(source, rule_dicts)
    other_links = []
    if source.endswith('hiapk.com'):
        other_links = get_app_list(url)
    for rule in rules:
        links = list(rule.link_extractor.extract_links(response))
        links.extend(other_links)
        if links and rule.process_links:
            links = rule.process_links(links)
        for link in links:
            if link not in all_link:
                all_link.append(link)
    if all_link:
        # NOTE: `rule` here is the last rule from the loop above, so its
        # link_type is applied to every collected link; this assumes all
        # rules for a source share one link type
        if self.name.startswith('update.'):
            service.report_update_status([LinkStatus(link.url, source, Status.FOUND, rule.link_type)
                                          for link in all_link])
            service.report_update_status([LinkStatus(url, source, Status.SUCCEED,
                                                     LinkType.CATELOG, len(all_link))])
        else:
            service.report_status([LinkStatus(link.url, source, Status.FOUND, rule.link_type)
                                   for link in all_link])
            service.report_status([LinkStatus(url, source, Status.SUCCEED,
                                              LinkType.CATELOG, len(all_link))])
    for link in all_link:
        yield self._create_request(link.url)

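# A hedged sketch of the rule objects _get_rules is assumed to build from
# meta['rules']: each carries a Scrapy link extractor, an optional
# process_links hook, and a LinkType tag. 'CrawlRule' is an illustrative name,
# not the project's actual type.
from collections import namedtuple

CrawlRule = namedtuple('CrawlRule', ['link_extractor', 'process_links', 'link_type'])
# e.g. (hypothetical): CrawlRule(SgmlLinkExtractor(allow=r'/app/\d+'), None, LinkType.LEAF)
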
def parse_zhushou_cate(self, response):
    meta = response.request.meta
    source = meta['domain']
    url = response.request.url
    import simplejson
    data = simplejson.loads(response.body)
    # docids = [d['docid'] for d in data['result']['data']] if 'data' in data['result'] else []
    docids = jsonutils.find_attr(data, 'docid', (str, int))
    urls = ['http://as.baidu.com/a/item?docid=%s' % i for i in docids]
    if urls:
        if self.name.startswith('update.'):
            service.report_update_status([LinkStatus(u, source, Status.FOUND, LinkType.LEAF)
                                          for u in urls])
            service.report_update_status([LinkStatus(url, source, Status.SUCCEED,
                                                     LinkType.CATELOG, len(urls))])
        else:
            service.report_status([LinkStatus(u, source, Status.FOUND, LinkType.LEAF)
                                   for u in urls])
            service.report_status([LinkStatus(url, source, Status.SUCCEED,
                                              LinkType.CATELOG, len(urls))])

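# Hedged illustration of why find_attr is used instead of the fixed path in
# the commented-out line above: docids may appear at varying depths and mix
# string and integer values. The payload below is invented, not captured:
def _example_baidu_docids():
    payload = {'result': {'data': [{'docid': 411}, {'docid': '250666'}]}}
    docids = jsonutils.find_attr(payload, 'docid', (str, int))
    return ['http://as.baidu.com/a/item?docid=%s' % i for i in docids]
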
def remove_app(source_link, name):
    service.report_status([LinkStatus(source_link, name, Status.FAIL, LinkType.UNKNOWN)])