Exemple #1
0
    def parse_api(self, response):
        """Parse a myapp.com JSON API response and report discovered links.

        Collects an app-detail link per entry (plus a related-apps API link
        when this response is not itself a relate query) and reports them
        all as FOUND, then reports this page as SUCCEED.
        """
        request = response.request
        source = request.meta['domain']
        url = request.url
        collected = []

        payload = json.loads(response.body)
        is_relate_query = 'qryrelate' in url

        for entry in payload['info']['value']:
            app_id = entry['appid']
            detail_link = "http://android.myapp.com/android/appdetail.jsp?appid=%s" % app_id
            if detail_link in collected:
                continue
            if not is_relate_query:
                collected.append('http://android.myapp.com/android/qryrelatesoft_web?appid=%s&icontype=60&pageNo=1&pageSize=50' % app_id)
            collected.append(detail_link)

        if not collected:
            return

        found = [LinkStatus(link, source, Status.FOUND, LinkType.CATELOG)
                 for link in collected]
        done = [LinkStatus(url, source, Status.SUCCEED, LinkType.CATELOG, len(collected))]
        if self.name.startswith('update.'):
            service.report_update_status(found)
            service.report_update_status(done)
        else:
            service.report_status(found)
            service.report_status(done)
Exemple #2
0
 def process_spider_input(self, response, spider):
     """Report non-success HTTP responses unless the status is whitelisted.

     2xx responses and statuses listed in the request meta's (or the
     spider's) handle_httpstatus_list pass through silently; anything
     else is reported as FAIL to the status service and to the monitor.
     """
     status = response.status
     # Common case: success — nothing to report.
     if 200 <= status < 300:
         return
     meta = response.request.meta
     allowed_statuses = meta.get(
         'handle_httpstatus_list',
         getattr(spider, 'handle_httpstatus_list', ()))
     if status in allowed_statuses:
         return
     request_url = response.request.url
     source = meta['domain']
     # report to server
     failed = [LinkStatus(request_url, source, Status.FAIL, LinkType.UNKNOWN)]
     if spider.name.startswith('update.'):
         service.report_update_status(failed)
     else:
         service.report_status(failed)
     # report to monitor
     market.report_link(source, 'http_error', request_url, status)
Exemple #3
0
    def _redirect(self, redirected, request, spider, reason):
        """Handle one redirect hop for *request*.

        Decrements the redirect TTL, copies redirect bookkeeping onto the
        redirected request and returns it for scheduling; raises
        IgnoreRequest once the redirect budget is exhausted.
        """
        # TTL defaults to max_redirect_times on the first hop.
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            # Keep the full chain of URLs that led to this request.
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                [request.url]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            log.msg("Redirecting (%s) to %s from %s" % (reason, redirected, request),
                    spider=spider, level=log.DEBUG)

            # Delegate to the spider's custom redirect processor if any.
            if spider.redirectprocessor_class:
                processor = spider.redirectprocessor_class()
                processor.process(redirected, request, spider, reason)

            # report redirect status, usually this link is not valid
            # NOTE(review): because this is an `elif`, the redirect is only
            # reported when the spider has NO redirectprocessor_class —
            # confirm that is intentional and not a misplaced branch.
            elif reason in [301, 302, 307]:
                source = request.meta['domain']
                if spider.name.startswith('update.'):
                    service.report_update_status([LinkStatus(request.url, source, Status.REDIRECT, LinkType.UNKNOWN)])
                else:
                    service.report_status([LinkStatus(request.url, source, Status.REDIRECT, LinkType.UNKNOWN)])

            return redirected
        else:
            log.msg("Discarding %s: max redirections reached" % request,
                    spider=spider, level=log.DEBUG)
            raise IgnoreRequest
Exemple #4
0
    def parse_item(self, response):
        """Parse an app detail page into an item.

        Reports link status to the status service, removes dead apps from
        the market, and returns the loaded item when it passes validation;
        otherwise returns None.
        """
        meta = response.request.meta
        source = meta['domain']
        log_info('parse_item_1===========')
        url = response.request.url
        if self.sourcelinkprocessor_class:
            processor = self.sourcelinkprocessor_class()
            url = processor.process(url)

        if not self._process_response(response, source, LinkType.LEAF):
            # BUG FIX: the original passed the builtin `type` as the link
            # type and unconditionally indexed meta['redirect_urls'],
            # which raises KeyError when no redirect occurred.
            failed_url = meta.get('redirect_urls', [url])[0]
            service.report_status([
                LinkStatus(failed_url, source, Status.FAIL, LinkType.LEAF)
            ])
            market.remove_app(url, source)
            log_info('parse_item_2===========')
            return

        # NOTE(review): self.parse is a generator elsewhere in this file;
        # calling it without iterating produces no side effects — confirm
        # this call is doing what the author intended.
        if not self.name.startswith(
                'update.') and self.name != 'itunes.apple.com':
            self.parse(response)

        if source.endswith('hiapk.com'):
            # hiapk pages contain invalid `</br>` tags; normalize them
            # before running the selectors.
            body = response.body.replace('</br>', '<p>')
            response = response.replace(body=body)

        if not self.itemloader_class:
            log_info('parse_item_3===========')
            return

        try:
            selector = HtmlXPathSelector(response)
            try:
                loader = self.itemloader_class(selector, response=response)
            except Exception:
                # Some loader classes do not accept a `response` keyword.
                loader = self.itemloader_class(selector)
            loader.add_value('source', source)
            loader.add_value('source_link', url)
        except Exception as e:
            log_info('parse_item_4===========\n%s' % e)
            log_error(e)
            if self.name.startswith('update.'):
                service.report_update_status(
                    [LinkStatus(url, source, Status.FAIL, LinkType.UNKNOWN)])
            else:
                service.report_status(
                    [LinkStatus(url, source, Status.FAIL, LinkType.UNKNOWN)])
            # BUG FIX: without this return, the code below raised
            # NameError on the undefined `loader`.
            return

        log_info('parse_item_5===========')
        try:
            item = loader.load_item()
            if self.is_item_valid(item):
                return item
            market.remove_app(url, source)
        except Exception as e:
            log_error(e)
    def process(self, redirected, request, spider, reason):
        """Report a failure when an on-site request redirects off-site.

        Matches both URLs against the spider's domain prefix; if the
        original request was on-site but the redirect target is not, the
        source link is reported as FAIL.
        """
        pattern = re.compile('^http://%s' % spider.name)

        redirected_match = pattern.match(redirected.url)
        request_match = pattern.match(request.url)
        if redirected_match is None and request_match:
            # BUG FIX: request_url was unbound (NameError) when the spider
            # had no sourcelinkprocessor_class; default to the raw URL.
            request_url = request.url
            if spider.sourcelinkprocessor_class:
                processor = spider.sourcelinkprocessor_class()
                request_url = processor.process(request.url)
            service.report_status([LinkStatus(request_url, spider.name, Status.FAIL, LinkType.LEAF)])
Exemple #6
0
    def parse_item(self, response):
        """Parse an app detail page into an item.

        Reports link status to the status service, removes dead apps from
        the market, and returns the loaded item when it passes validation;
        otherwise returns None.
        """
        meta = response.request.meta
        source = meta['domain']
        log_info('parse_item_1===========')
        url = response.request.url
        if self.sourcelinkprocessor_class:
            processor = self.sourcelinkprocessor_class()
            url = processor.process(url)

        if not self._process_response(response, source, LinkType.LEAF):
            # BUG FIX: the original passed the builtin `type` as the link
            # type and unconditionally indexed meta['redirect_urls'],
            # which raises KeyError when no redirect occurred.
            failed_url = meta.get('redirect_urls', [url])[0]
            service.report_status([LinkStatus(failed_url, source, Status.FAIL, LinkType.LEAF)])
            market.remove_app(url, source)
            log_info('parse_item_2===========')
            return

        # NOTE(review): self.parse is a generator elsewhere in this file;
        # calling it without iterating produces no side effects — confirm
        # this call is doing what the author intended.
        if not self.name.startswith('update.') and self.name != 'itunes.apple.com':
            self.parse(response)

        if source.endswith('hiapk.com'):
            # hiapk pages contain invalid `</br>` tags; normalize them
            # before running the selectors.
            body = response.body.replace('</br>', '<p>')
            response = response.replace(body=body)

        if not self.itemloader_class:
            log_info('parse_item_3===========')
            return

        try:
            selector = HtmlXPathSelector(response)
            try:
                loader = self.itemloader_class(selector, response=response)
            except Exception:
                # Some loader classes do not accept a `response` keyword.
                loader = self.itemloader_class(selector)
            loader.add_value('source', source)
            loader.add_value('source_link', url)
        except Exception as e:
            log_info('parse_item_4===========\n%s' % e)
            log_error(e)
            if self.name.startswith('update.'):
                service.report_update_status([LinkStatus(url, source, Status.FAIL, LinkType.UNKNOWN)])
            else:
                service.report_status([LinkStatus(url, source, Status.FAIL, LinkType.UNKNOWN)])
            # BUG FIX: without this return, the code below raised
            # NameError on the undefined `loader`.
            return

        log_info('parse_item_5===========')
        try:
            item = loader.load_item()
            if self.is_item_valid(item):
                return item
            market.remove_app(url, source)
        except Exception as e:
            log_error(e)
Exemple #7
0
    def _process_response(self, response, source, type):
        '''
        Returns True if response can be further processed otherwise False.

        Runs the source filter/link processors on the request URL and
        reports a SUCCEED or FAIL LinkStatus for it along the way.
        '''
        url = response.request.url
        if self.sourcefilterprocessor_class:
            processor = self.sourcefilterprocessor_class()
            url = processor.process(self, url)

        # The filter processor signals a rejected link by returning None.
        if url is None:
            service.report_status([LinkStatus(url, source, Status.FAIL, type)])
            return False

        if self.sourcelinkprocessor_class:
            processor = self.sourcelinkprocessor_class()
            url = processor.process(url)

        if url is None:
            service.report_status([LinkStatus(url, source, Status.FAIL, type)])
            return False

        # BUG FIX: the non-200 branch reported Status.SUCCEED for
        # 'update.' spiders; it now reports FAIL, matching the non-update
        # branch. Stray no-op `pass` statements removed.
        succeeded = response.status == 200
        status = Status.SUCCEED if succeeded else Status.FAIL
        report = [LinkStatus(url, source, status, type)]
        if self.name.startswith('update.'):
            service.report_update_status(report)
        else:
            service.report_status(report)
        return succeeded
    def process(self, redirected, request, spider, reason):
        """Report a failure when an on-site request redirects off-site.

        Matches both URLs against the spider's domain prefix; if the
        original request was on-site but the redirect target is not, the
        source link is reported as FAIL.
        """
        pattern = re.compile('^http://%s' % spider.name)

        redirected_match = pattern.match(redirected.url)
        request_match = pattern.match(request.url)
        if redirected_match is None and request_match:
            # BUG FIX: request_url was unbound (NameError) when the spider
            # had no sourcelinkprocessor_class; default to the raw URL.
            request_url = request.url
            if spider.sourcelinkprocessor_class:
                processor = spider.sourcelinkprocessor_class()
                request_url = processor.process(request.url)
            service.report_status([
                LinkStatus(request_url, spider.name, Status.FAIL,
                           LinkType.LEAF)
            ])
Exemple #9
0
    def parse_zhushou_cate(self, response):
        """Parse a wandoujia category JSON listing and report app links.

        Extracts package names from the JSON body, builds detail-page URLs
        for them, reports each as FOUND and this page as SUCCEED.
        """
        import simplejson
        meta = response.request.meta
        source = meta['domain']
        page_url = response.request.url

        payload = simplejson.loads(response.body)
        package_names = jsonutils.find_attr(payload, 'packageName', str)
        app_urls = ['http://www.wandoujia.com/apps/%s' % pn for pn in package_names]

        if not app_urls:
            return

        found = [LinkStatus(u, source, Status.FOUND, LinkType.LEAF) for u in app_urls]
        done = [LinkStatus(page_url, source, Status.SUCCEED, LinkType.CATELOG, len(app_urls))]
        if self.name.startswith('update.'):
            service.report_update_status(found)
            service.report_update_status(done)
        else:
            service.report_status(found)
            service.report_status(done)
Exemple #10
0
    def parse(self, response):
        """Extract catalogue links from the response per the source's rules,
        report them to the status service, and yield one crawl request per
        unique link."""
        meta = response.request.meta
        source = meta['domain']
        url = response.request.url
        # Nothing to do when the response itself failed processing.
        if not self._process_response(response, source, LinkType.CATELOG):
            return

        rules = self._get_rules(source, meta['rules'])

        extra_links = []
        if source.endswith('hiapk.com'):
            extra_links = get_app_list(url)

        unique_links = []
        for rule in rules:
            links = list(rule.link_extractor.extract_links(response))
            links.extend(extra_links)
            if links and rule.process_links:
                links = rule.process_links(links)
            for link in links:
                if link not in unique_links:
                    unique_links.append(link)

        if unique_links:
            # NOTE: `rule` is the last rule bound by the loop above, so all
            # links are reported with that rule's link_type (original
            # behavior preserved).
            found = [LinkStatus(link.url, source, Status.FOUND, rule.link_type)
                     for link in unique_links]
            done = [LinkStatus(url, source, Status.SUCCEED, LinkType.CATELOG,
                               len(unique_links))]
            if self.name.startswith('update.'):
                service.report_update_status(found)
                service.report_update_status(done)
            else:
                service.report_status(found)
                service.report_status(done)

        for link in unique_links:
            yield self._create_request(link.url)
Exemple #11
0
    def parse_zhushou_cate(self, response):
        """Parse a Baidu app-store JSON listing and report app links.

        Extracts doc ids from the JSON body, builds item-detail URLs for
        them, reports each as FOUND and this page as SUCCEED.
        """
        import simplejson
        meta = response.request.meta
        source = meta['domain']
        page_url = response.request.url

        payload = simplejson.loads(response.body)
        doc_ids = jsonutils.find_attr(payload, 'docid', (str, int))
        app_urls = ['http://as.baidu.com/a/item?docid=%s' % i for i in doc_ids]

        if not app_urls:
            return

        found = [LinkStatus(u, source, Status.FOUND, LinkType.LEAF) for u in app_urls]
        done = [LinkStatus(page_url, source, Status.SUCCEED, LinkType.CATELOG, len(app_urls))]
        if self.name.startswith('update.'):
            service.report_update_status(found)
            service.report_update_status(done)
        else:
            service.report_status(found)
            service.report_status(done)
Exemple #12
0
 def process_spider_input(self, response, spider):
     """Report failed (non-2xx, non-whitelisted) responses.

     Statuses listed in the request meta's (or the spider's)
     handle_httpstatus_list are allowed through; everything else is
     reported as FAIL to the status service and to the monitor.
     """
     status = response.status
     # common case
     if 200 <= status < 300:
         return
     meta = response.request.meta
     allowed = meta.get('handle_httpstatus_list',
                        getattr(spider, 'handle_httpstatus_list', ()))
     if status in allowed:
         return
     source = meta['domain']
     request_url = response.request.url
     # report to server
     failed = [LinkStatus(request_url, source, Status.FAIL, LinkType.UNKNOWN)]
     reporter = (service.report_update_status
                 if spider.name.startswith('update.')
                 else service.report_status)
     reporter(failed)
     # report to monitor
     market.report_link(source, 'http_error', request_url, status)
Exemple #13
0
    def _redirect(self, redirected, request, spider, reason):
        """Handle one redirect hop for *request*.

        Decrements the redirect TTL, copies redirect bookkeeping onto the
        redirected request and returns it for scheduling; raises
        IgnoreRequest once the redirect budget is exhausted.
        """
        # TTL defaults to max_redirect_times on the first hop.
        ttl = request.meta.setdefault('redirect_ttl', self.max_redirect_times)
        redirects = request.meta.get('redirect_times', 0) + 1

        if ttl and redirects <= self.max_redirect_times:
            redirected.meta['redirect_times'] = redirects
            redirected.meta['redirect_ttl'] = ttl - 1
            # Keep the full chain of URLs that led to this request.
            redirected.meta['redirect_urls'] = request.meta.get('redirect_urls', []) + \
                [request.url]
            redirected.dont_filter = request.dont_filter
            redirected.priority = request.priority + self.priority_adjust
            log.msg("Redirecting (%s) to %s from %s" %
                    (reason, redirected, request),
                    spider=spider,
                    level=log.DEBUG)

            # Delegate to the spider's custom redirect processor if any.
            if spider.redirectprocessor_class:
                processor = spider.redirectprocessor_class()
                processor.process(redirected, request, spider, reason)

            # report redirect status, usually this link is not valid
            # NOTE(review): because this is an `elif`, the redirect is only
            # reported when the spider has NO redirectprocessor_class —
            # confirm that is intentional and not a misplaced branch.
            elif reason in [301, 302, 307]:
                source = request.meta['domain']
                if spider.name.startswith('update.'):
                    service.report_update_status([
                        LinkStatus(request.url, source, Status.REDIRECT,
                                   LinkType.UNKNOWN)
                    ])
                else:
                    service.report_status([
                        LinkStatus(request.url, source, Status.REDIRECT,
                                   LinkType.UNKNOWN)
                    ])

            return redirected
        else:
            log.msg("Discarding %s: max redirections reached" % request,
                    spider=spider,
                    level=log.DEBUG)
            raise IgnoreRequest
Exemple #14
0
    def parse(self, response):
        """Extract catalogue links from the response per the source's rules,
        report them to the status service, and yield one crawl request per
        unique link.
        """
        meta = response.request.meta
        source = meta['domain']
        all_link = []
        url = response.request.url
        # Nothing to do when the response itself failed processing.
        if not self._process_response(response, source, LinkType.CATELOG):
            return

        rule_dicts = meta['rules']
        rules = self._get_rules(source, rule_dicts)

        other_links = []
        if source.endswith('hiapk.com'):
            # hiapk needs a supplemental, site-specific app-link list.
            other_links = get_app_list(url)

        for rule in rules:
            links = [l for l in rule.link_extractor.extract_links(response)]
            links.extend(other_links)
            if links and rule.process_links:
                links = rule.process_links(links)

            # Deduplicate while preserving discovery order.
            for link in links:
                if link not in all_link:
                    all_link.append(link)

        if all_link:
            # NOTE(review): `rule` here is the last rule of the loop above,
            # so every link is reported with that rule's link_type —
            # confirm this is intended.
            if self.name.startswith('update.'):
                service.report_update_status([LinkStatus(link.url, source, Status.FOUND, rule.link_type)
                                             for link in all_link])
                service.report_update_status([LinkStatus(url, source, Status.SUCCEED, LinkType.CATELOG, len(all_link))])
            else:
                service.report_status([LinkStatus(link.url, source, Status.FOUND, rule.link_type) for link in all_link])
                service.report_status([LinkStatus(url, source, Status.SUCCEED, LinkType.CATELOG, len(all_link))])

        for link in all_link:
            yield self._create_request(link.url)
Exemple #15
0
    def _process_response(self, response, source, type):
        '''
        Returns True if response can be further processed otherwise False.

        Runs the source filter/link processors on the request URL and
        reports a SUCCEED or FAIL LinkStatus for it along the way.
        '''
        url = response.request.url
        if self.sourcefilterprocessor_class:
            processor = self.sourcefilterprocessor_class()
            url = processor.process(self, url)

        # The filter processor signals a rejected link by returning None.
        if url is None:
            service.report_status([LinkStatus(url, source, Status.FAIL, type)])
            return False

        if self.sourcelinkprocessor_class:
            processor = self.sourcelinkprocessor_class()
            url = processor.process(url)

        if url is None:
            service.report_status([LinkStatus(url, source, Status.FAIL, type)])
            return False

        # BUG FIX: the non-200 branch reported Status.SUCCEED for
        # 'update.' spiders; it now reports FAIL, matching the non-update
        # branch. Stray no-op `pass` statements removed.
        succeeded = response.status == 200
        status = Status.SUCCEED if succeeded else Status.FAIL
        report = [LinkStatus(url, source, status, type)]
        if self.name.startswith('update.'):
            service.report_update_status(report)
        else:
            service.report_status(report)
        return succeeded
Exemple #16
0
def remove_app(source_link, name):
    """Report *source_link* from source *name* as a failed app link."""
    status = LinkStatus(source_link, name, Status.FAIL, LinkType.UNKNOWN)
    service.report_status([status])
Exemple #17
0
def remove_app(source_link, name):
    """Report *source_link* from source *name* as a failed app link."""
    failed = LinkStatus(source_link, name, Status.FAIL, LinkType.UNKNOWN)
    service.report_status([failed])