Example 1
    def process_item(self, item, spider):
        if item.__class__ == AppIdentificationItem:
            obj, created = AppIdentification.objects.get_or_create(
                apk_name=item['apk_name'])
            if 'top_type' in item and (item['top_type'] != obj.top_type):
                obj.top_type = item['top_type']
                obj.save()
            if 'category' in item:
                cat, is_created = Category.objects.get_or_create(
                    name=item['category'], top_type=item['top_type'])
            if created:
                appinfo = AppInfo(app_id=obj, data_source=item['data_source'])
                appinfo.save()
                log.msg('Get new apk %s' % obj.apk_name, level=log.INFO)
                return item
            else:
                spider.log('Duplicate apk %s' % obj.apk_name, level=log.INFO)
                return

        if item.__class__ == AppInfoItem:
            app = item['instance']
            # basic info
            for key in APK_DETAILS_FILED_NAMES:
                setattr(app, key, item[key])
            app.is_crawled = 1
            app.last_crawl_time = datetime.now()
            app.save()
            # related info
            update_app_related(app, item)
            spider.log('update ok %s' % item['apk_name'], log.INFO)
            # sync data to Doraemon
            url = "%s/?apk_name=%s&force=%s" % (
                self.crawler.settings['DATA_SYNC_API'], app.app_id.apk_name,
                self.crawler.settings.get('FORCE_UPDATE'))
            # return the Deferred; sync to Doraemon
            request = Request(url=url)
            request.callback = None
            request.errback = None
            dfd = self.crawler.engine.download(request, spider)
            dfd.addCallbacks(callback=self._sync_callback,
                             callbackArgs=(item['apk_name'], spider),
                             errback=self._sync_errback,
                             errbackArgs=(item['apk_name'], spider))
            dfd.addErrback(spider.log, level=log.ERROR)
            return dfd.addBoth(lambda _: item)
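The Deferred returned above is chained to two handlers, _sync_callback and _sync_errback, whose bodies are not part of the excerpt. A minimal sketch of what they might look like, assuming they only log the outcome of the Doraemon sync request (the names and the extra arguments come from the addCallbacks call above; the bodies are illustrative):

    def _sync_callback(self, response, apk_name, spider):
        # Invoked with the downloaded response plus the callbackArgs above.
        spider.log('sync ok %s (HTTP %s)' % (apk_name, response.status),
                   level=log.INFO)

    def _sync_errback(self, failure, apk_name, spider):
        # Invoked with the Failure plus the errbackArgs above.
        spider.log('sync failed %s: %s' % (apk_name, failure.getErrorMessage()),
                   level=log.ERROR)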
Example 2
    def process_item(self, item, spider):
        if item.__class__ == AppIdentificationItem:
            obj, created = AppIdentification.objects.get_or_create(
                apk_name=item['apk_name']
            )
            if 'top_type' in item and (item['top_type'] != obj.top_type):
                obj.top_type = item['top_type']
                obj.save()
            if 'category' in item:
                cat, is_created = Category.objects.get_or_create(name=item['category'], top_type=item['top_type'])
            if created:
                appinfo = AppInfo(app_id=obj, data_source=item['data_source'])
                appinfo.save()
                log.msg('Get new apk %s' % obj.apk_name, level=log.INFO)
                return item
            else:
                spider.log('Duplicate apk %s' % obj.apk_name, level=log.INFO)
                return

        if item.__class__ == AppInfoItem:
            app = item['instance']
            # basic info
            for key in APK_DETAILS_FILED_NAMES:
                setattr(app, key, item[key])
            app.is_crawled = 1
            app.last_crawl_time = datetime.now()
            app.save()
            # related info
            update_app_related(app, item)
            spider.log('update ok %s' % item['apk_name'], log.INFO)
            # sync data to Doraemon
            url = "%s/?apk_name=%s&force=%s" % (self.crawler.settings['DATA_SYNC_API'], app.app_id.apk_name, self.crawler.settings.get('FORCE_UPDATE'))
            # return the Deferred; sync to Doraemon
            request = Request(url=url)
            request.callback = None
            request.errback = None
            dfd = self.crawler.engine.download(request, spider)
            dfd.addCallbacks(
                callback=self._sync_callback, callbackArgs=(item['apk_name'], spider),
                errback=self._sync_errback, errbackArgs=(item['apk_name'], spider))
            dfd.addErrback(spider.log, level=log.ERROR)
            return dfd.addBoth(lambda _: item)
Example 3
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()
            self.expire_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if item:
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])

            try:
                if 'callback' in item and item['callback'] is not None:
                    req.callback = getattr(self.spider, item['callback'])
            except AttributeError:
                self.logger.warn("Unable to find callback method")

            try:
                if 'errback' in item and item['errback'] is not None:
                    req.errback = getattr(self.spider, item['errback'])
            except AttributeError:
                self.logger.warn("Unable to find errback method")

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in list(item.keys()):
                req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])

            return req

        return None
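next_request() consumes a plain dict pulled from one of the Redis queues. The keys it reads imply an item shaped roughly like the one below; this is an illustration inferred from the code above, not a schema taken from the project:

    # Illustrative queue item, inferred from the keys next_request() reads.
    example_item = {
        'url': 'example.com/page',          # may lack a scheme; http:// is prepended on ValueError
        'callback': 'parse_page',           # spider method name resolved on the spider
        'errback': 'handle_error',          # spider method name for failures
        'meta': {                           # when present, replaces the item for the steps below
            'curdepth': 2,                  # defaults to 0 when absent
            'retry_times': 1,               # defaults to 0 when absent
            'useragent': 'my-crawler/1.0',  # copied into the User-Agent header
            'cookie': {'session': 'abc'},   # dict used directly, or a string passed to parse_cookie()
        },
    }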
Example 4
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''

        t = time.time()
        # update the redis queues every so often

        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        item = self.find_item()

        if item:
            self.logger.info(
                'distributed_scheduler.py::DistributedScheduler::next_request call find_item() result is : %s'
                % (item["meta"]["url"] if 'meta' in item else item["url"]))
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])

            if 'callback' in item:
                cb = item['callback']
                if cb and self.spider:
                    cb = get_method(self.spider, cb)
                    req.callback = cb

            if 'errback' in item:
                eb = item['errback']
                if eb and self.spider:
                    eb = get_method(self.spider, eb)
                    req.errback = eb

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in item.keys():
                req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])
            return req

        return None
Example 5
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()
            self.expire_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if item:
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])

            try:
                if 'callback' in item and item['callback'] is not None:
                    req.callback = getattr(self.spider, item['callback'])
            except AttributeError:
                self.logger.warn("Unable to find callback method")

            try:
                if 'errback' in item and item['errback'] is not None:
                    req.errback = getattr(self.spider, item['errback'])
            except AttributeError:
                self.logger.warn("Unable to find errback method")

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in list(item.keys()):
                req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])

            return req

        return None
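Both scheduler variants fall back to self.parse_cookie() when the cookie value is a string rather than a dict. The helper is not shown in the excerpt; a minimal sketch, assuming it only splits a standard "k1=v1; k2=v2" cookie string into a dict suitable for Request.cookies:

    def parse_cookie(self, string):
        # Hypothetical sketch of the parse_cookie() helper referenced above;
        # the real implementation may differ.
        results = {}
        for pair in string.split(';'):
            if '=' in pair:
                key, value = pair.split('=', 1)
                results[key.strip()] = value.strip()
        return results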
Example 6
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''

        t = time.time()
        # update the redis queues every so often

        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        item = self.find_item()

        if item:
            self.logger.info(
                'distributed_scheduler.py::DistributedScheduler::next_request call find_item() result is : %s' % (
                item["meta"]["url"] if 'meta' in item else item["url"]))
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])

            if 'callback' in item:
                cb = item['callback']
                if cb and self.spider:
                    cb = get_method(self.spider, cb)
                    req.callback = cb

            if 'errback' in item:
                eb = item['errback']
                if eb and self.spider:
                    eb = get_method(self.spider, eb)
                    req.errback = eb

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in item.keys():
                req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])
            return req

        return None
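Examples 4 and 6 resolve callback and errback names through get_method() instead of a bare getattr(). A minimal sketch of such a helper, assuming it simply looks the name up on the spider and logs when it is missing (the real implementation may differ):

    import logging

    def get_method(obj, name):
        # Hypothetical helper: resolve a callback/errback name to a bound
        # method on the spider; return None (and log) when it does not exist.
        try:
            return getattr(obj, str(name))
        except AttributeError:
            logging.getLogger(__name__).error("Method %s not found on %s", name, obj)
            return None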