Beispiel #1
0
 def process_spider_input(self, response, spider):
     """Report responses whose HTTP status is neither 2xx nor allowed.

     A response is let through silently when its status is in the 2xx
     range, or when it is listed in ``handle_httpstatus_list`` (taken from
     the request meta if present there, otherwise from the spider
     attribute of the same name, defaulting to an empty tuple).

     Any other response is reported as a failed link of unknown type to
     ``service`` and as an ``'http_error'`` to ``market``. The method
     always returns ``None``.
     """
     status = response.status
     # Fast path: the overwhelming majority of responses are 2xx.
     if 200 <= status < 300:
         return

     request = response.request
     meta = request.meta
     # A per-request list in meta takes precedence over the spider-wide one.
     allowed_statuses = (
         meta['handle_httpstatus_list']
         if 'handle_httpstatus_list' in meta
         else getattr(spider, 'handle_httpstatus_list', ())
     )
     if status in allowed_statuses:
         return

     source = meta['domain']

     # Spiders named "update.*" report through the update channel.
     if spider.name.startswith('update.'):
         report = service.report_update_status
     else:
         report = service.report_status
     report([
         LinkStatus(request.url, source, Status.FAIL, LinkType.UNKNOWN)
     ])

     # The monitor also receives the raw HTTP status code.
     market.report_link(source, 'http_error', request.url, status)
Beispiel #2
0
    def parse_item(self, response):
        """Parse a leaf page into an item.

        Reads the source name from ``response.request.meta['domain']``,
        passes the request URL through ``self.sourcelinkprocessor_class``
        if one is set, and checks the response with ``_process_response``.
        On a rejected response the link is reported as failed and removed
        via ``market.remove_app``.

        Returns:
            The loaded item when ``self.is_item_valid(item)`` is true,
            otherwise ``None`` (rejected response, no item loader
            configured, loader failure, or invalid item).
        """
        meta = response.request.meta
        source = meta['domain']
        log_info('parse_item_1===========')
        url = response.request.url
        if self.sourcelinkprocessor_class:
            processor = self.sourcelinkprocessor_class()
            url = processor.process(url)

        if not self._process_response(response, source, LinkType.LEAF):
            # 'redirect_urls' is only present when the request went through
            # a redirect; fall back to the request URL instead of raising
            # KeyError for responses that were never redirected.
            redirect_urls = meta.get('redirect_urls') or [response.request.url]
            # Report with the same link type that was just checked; the
            # previous code passed the builtin ``type`` here by mistake.
            service.report_status([
                LinkStatus(redirect_urls[0], source, Status.FAIL,
                           LinkType.LEAF)
            ])
            market.remove_app(url, source)
            log_info('parse_item_2===========')
            return None

        if not self.name.startswith(
                'update.') and self.name != 'itunes.apple.com':
            # NOTE(review): the return value is discarded. If parse() is a
            # generator, this call has no effect -- confirm the intent.
            self.parse(response)

        if source.endswith('hiapk.com'):
            body = response.body.replace('</br>', '<p>')
            response = response.replace(body=body)

        if not self.itemloader_class:
            log_info('parse_item_3===========')
            return None

        try:
            selector = HtmlXPathSelector(response)
            try:
                loader = self.itemloader_class(selector, response=response)
            except Exception:
                # Retry with the selector alone when construction with the
                # ``response`` keyword fails.
                loader = self.itemloader_class(selector)
            loader.add_value('source', source)
            loader.add_value('source_link', url)
        except Exception as e:
            log_info('parse_item_4===========\n%s' % e)
            log_error(e)
            if self.name.startswith('update.'):
                service.report_update_status(
                    [LinkStatus(url, source, Status.FAIL, LinkType.UNKNOWN)])
            else:
                service.report_status(
                    [LinkStatus(url, source, Status.FAIL, LinkType.UNKNOWN)])
            # Without a usable loader there is nothing to load; previously
            # execution fell through and hit an unbound ``loader``.
            return None

        log_info('parse_item_5===========')
        try:
            item = loader.load_item()
            if self.is_item_valid(item):
                return item
            market.remove_app(url, source)
        except Exception as e:
            log_error(e)
        return None
Beispiel #3
0
    def parse(self, response):
        """Extract links from a catalog page and yield requests for them.

        The response is first checked with ``_process_response``; a
        rejected response yields nothing. Otherwise every rule returned by
        ``self._get_rules(source, meta['rules'])`` extracts links from the
        response, the links are de-duplicated in discovery order, and the
        result is reported to ``service`` (through the update channel for
        spiders whose name starts with ``'update.'``).

        Each found link is reported with the ``link_type`` of the rule that
        first produced it. Previously all links were reported with the
        ``link_type`` of whichever rule happened to be iterated last.

        Yields:
            ``self._create_request(link.url)`` for every unique link.
        """
        meta = response.request.meta
        source = meta['domain']
        url = response.request.url
        if not self._process_response(response, source, LinkType.CATELOG):
            return

        rules = self._get_rules(source, meta['rules'])

        other_links = []
        if source.endswith('hiapk.com'):
            other_links = get_app_list(url)

        # Unique links in discovery order; link_types[i] is the link type
        # of the rule that first produced all_link[i].
        all_link = []
        link_types = []
        for rule in rules:
            links = list(rule.link_extractor.extract_links(response))
            links.extend(other_links)
            if links and rule.process_links:
                links = rule.process_links(links)

            for link in links:
                if link not in all_link:
                    all_link.append(link)
                    link_types.append(rule.link_type)

        if all_link:
            if self.name.startswith('update.'):
                report = service.report_update_status
            else:
                report = service.report_status
            report([
                LinkStatus(link.url, source, Status.FOUND, link_type)
                for link, link_type in zip(all_link, link_types)
            ])
            # The catalog page itself succeeded; include the link count.
            report([
                LinkStatus(url, source, Status.SUCCEED, LinkType.CATELOG,
                           len(all_link))
            ])

        for link in all_link:
            yield self._create_request(link.url)
Beispiel #4
0
    def _redirect(self, redirected, request, spider, reason):
        """Prepare ``redirected`` as the follow-up of ``request``.

        Copies the redirect bookkeeping (``redirect_times``,
        ``redirect_ttl``, ``redirect_urls``) into the new request's meta,
        carries over ``dont_filter`` and adjusts the priority by
        ``self.priority_adjust``. Afterwards the spider's
        ``redirectprocessor_class`` is run if set; otherwise, for reasons
        301, 302 and 307, the original URL is reported with
        ``Status.REDIRECT``.

        Returns:
            The updated ``redirected`` request.

        Raises:
            IgnoreRequest: when the TTL is exhausted or the redirect count
                exceeds ``self.max_redirect_times``.
        """
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        # Give up first so that the main path below stays flat.
        if not (ttl and redirects <= self.max_redirect_times):
            log.msg("Discarding %s: max redirections reached" % request,
                    spider=spider,
                    level=log.DEBUG)
            raise IgnoreRequest

        new_meta = redirected.meta
        new_meta['redirect_times'] = redirects
        new_meta['redirect_ttl'] = ttl - 1
        previous_urls = request.meta.get('redirect_urls', [])
        new_meta['redirect_urls'] = previous_urls + [request.url]
        redirected.dont_filter = request.dont_filter
        redirected.priority = request.priority + self.priority_adjust
        log.msg("Redirecting (%s) to %s from %s" %
                (reason, redirected, request),
                spider=spider,
                level=log.DEBUG)

        processor_class = spider.redirectprocessor_class
        if processor_class:
            processor_class().process(redirected, request, spider, reason)
        elif reason in (301, 302, 307):
            # Report redirect status; usually such a link is not valid.
            source = request.meta['domain']
            if spider.name.startswith('update.'):
                report = service.report_update_status
            else:
                report = service.report_status
            report([
                LinkStatus(request.url, source, Status.REDIRECT,
                           LinkType.UNKNOWN)
            ])

        return redirected
    def process(self, redirected, request, spider, reason):
        """Report a leaf link as failed when it redirects off the site.

        A URL counts as "on the site" when it starts with
        ``http://<spider.name>``. If the original request URL is on the
        site but the redirect target is not, the request URL (passed
        through ``spider.sourcelinkprocessor_class`` when one is set) is
        reported to ``service`` with ``Status.FAIL`` and
        ``LinkType.LEAF``. In every other case nothing happens.

        ``reason`` is accepted for interface compatibility and not used.
        Always returns ``None``.
        """
        # Escape the spider name so that the dots in a domain-style name
        # match literally instead of matching any character.
        pattern = re.compile('^http://%s' % re.escape(spider.name))

        redirected_match = pattern.match(redirected.url)
        request_match = pattern.match(request.url)
        if redirected_match is not None or not request_match:
            return

        # Default to the raw request URL; previously ``request_url`` was
        # left unbound when no source-link processor was configured.
        request_url = request.url
        if spider.sourcelinkprocessor_class:
            processor = spider.sourcelinkprocessor_class()
            request_url = processor.process(request.url)
        service.report_status([
            LinkStatus(request_url, spider.name, Status.FAIL,
                       LinkType.LEAF)
        ])
Beispiel #6
0
    def _process_response(self, response, source, type):
        '''
        Returns True if response can be further processed otherwise False.

        The request URL is passed through ``self.sourcefilterprocessor_class``
        and then ``self.sourcelinkprocessor_class`` when those are set. If
        either step yields ``None`` the link is reported as failed and
        False is returned. Otherwise the link is reported as SUCCEED when
        ``response.status == 200`` and as FAIL for any other status, through
        ``service.report_update_status`` for spiders whose name starts with
        ``'update.'`` and through ``service.report_status`` for all others.

        ``type`` is the link type forwarded to ``LinkStatus``; the name
        shadows the builtin but is kept for interface compatibility.
        '''
        original_url = response.request.url
        url = original_url
        if self.sourcefilterprocessor_class:
            processor = self.sourcefilterprocessor_class()
            url = processor.process(self, url)

        if url is None:
            # Report the URL that was rejected rather than ``None``.
            service.report_status(
                [LinkStatus(original_url, source, Status.FAIL, type)])
            return False

        filtered_url = url
        if self.sourcelinkprocessor_class:
            processor = self.sourcelinkprocessor_class()
            url = processor.process(url)

        if url is None:
            # Same here: fall back to the last known non-None URL.
            service.report_status(
                [LinkStatus(filtered_url, source, Status.FAIL, type)])
            return False

        succeeded = response.status == 200
        # A non-200 response is a failure for update spiders as well; the
        # previous code reported SUCCEED for them in the failure branch.
        status = Status.SUCCEED if succeeded else Status.FAIL
        if self.name.startswith('update.'):
            report = service.report_update_status
        else:
            report = service.report_status
        report([LinkStatus(url, source, status, type)])
        return succeeded
Beispiel #7
0
def remove_app(source_link, name):
    """Report ``source_link`` of source ``name`` as a failed link.

    Sends a one-element list holding
    ``LinkStatus(source_link, name, Status.FAIL, LinkType.UNKNOWN)`` to
    ``service.report_status``.
    """
    report = service.report_status
    failed_link = LinkStatus(source_link, name, Status.FAIL, LinkType.UNKNOWN)
    report([failed_link])