Example #1
 def _construct_query(self, page_num, query):
     url = 'http://www.innojoy.com/client/interface.aspx'
     data = {"requestModule": "PatentSearch",
             "userId": "",
             "patentSearchConfig": {
                 "Query": query,
                 "TreeQuery": "",
                 "Database": "idpat,mypat,phpat,sgpat,itpat,inpat,inapp,chpat,frpat,gbpat,depat,jpapp,eppat,wopat,usapp,usdes,uspp,usre,uspat,fmsq,wgzl,syxx,fmzl",
                 "Action": "Search",
                 "Page": str(page_num),
                 "PageSize": self._page_size,
                 "GUID": "",
                 "Sortby": "",
                 "AddOnes": "",
                 "DelOnes": "",
                 "RemoveOnes": "",
                 "TrsField": "",
                 "SmartSearch": ""
             }
     }
     data_bin = json.dumps(data)
     headers = {
         'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
         'Referer': 'http://www.innojoy.com/SearchResult/default.shtml',
     }
     request = Request(url=url, method='post', headers=headers, body=data_bin)
     # noinspection PyUnresolvedReferences
     request.callback = self.query_callback
     return request
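A request built this way is normally yielded from a spider callback; Scrapy also accepts the callback directly in the Request constructor. A minimal sketch of how it might be driven (the query string and the body of query_callback are assumptions, not part of the original project):

 def start_requests(self):
     # the callback could equally be passed as Request(..., callback=self.query_callback)
     yield self._construct_query(page_num=1, query='TI=(robot)')

 def query_callback(self, response):
     # assuming the interface answers with JSON, as the request body suggests
     result = json.loads(response.text)
     # ... extract patent records from result and request further pages ...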
Example #2
 def get_next_page_request(self, response):
     # NOTE: time.sleep() and the blocking requests.post() below stall the
     # Twisted reactor; tolerable for a single page turn, but not ideal in Scrapy.
     sleep_time = self.crawler.settings.get('DOWNLOAD_DELAY', 1)
     time.sleep(sleep_time)
     request_data = response.request.body
     data = json.loads(request_data)
     page_number = data.get('PageNumber', None)
     if not page_number:
         return
     page_number = int(page_number) + 1
     data['PageNumber'] = page_number
     referer = response.request.headers['Referer']
     url = 'http://searchtel.patentstar.com.cn/CPRS2010/cn/PatentGeneralList.aspx/GetXmlResult'
     headers = {
         'Content-Type': 'application/json; charset=UTF-8',
         'Referer': referer,
         'Cookie': get_cookie(),
     }
     _response = requests.post(url, data=json.dumps(data), headers=headers)
     try:
         result = _response.json()
         if result['d'][0] is not None:
             immediate_response = response_requests2scrapy(_response)
             meta = {
                 'immediate_response': immediate_response
             }
             request = Request(url, headers=headers, method='POST', meta=meta)
             # noinspection PyUnresolvedReferences
             request.callback = self.query_callback
             return request
     except Exception as e:
         log.msg('spider turn page error: %s' % str(e), level=log.INFO)
         return None
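The helper response_requests2scrapy is not shown in this example; it apparently wraps the requests.Response in a Scrapy response object so the spider can consume it from meta. A rough sketch of what such a helper could look like (the name comes from the call above, the implementation is an assumption):

 from scrapy.http import TextResponse

 def response_requests2scrapy(resp):
     # wrap a requests.Response in a Scrapy TextResponse so it can be parsed
     # with the usual selectors
     return TextResponse(url=resp.url,
                         status=resp.status_code,
                         headers=dict(resp.headers),
                         body=resp.content,
                         encoding=resp.encoding or 'utf-8')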
Example #3
 # Note: apparently a nested helper rather than a spider method -- start_urls and
 # callback come from the enclosing scope, and basestring is Python 2 only.
 def start_requests():
     for url in start_urls:
         req = Request(url, dont_filter=True) if isinstance(
             url, basestring) else url
         if callback is not None:
             req.callback = callback
         yield req
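On Python 3 basestring no longer exists; the same helper might look like this (a sketch, keeping the assumption that start_urls may mix URL strings with ready-made Request objects):

 def start_requests():
     for url in start_urls:
         # str replaces basestring; anything else is assumed to already be a Request
         req = Request(url, dont_filter=True) if isinstance(url, str) else url
         if callback is not None:
             req.callback = callback
         yield req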
Example #4
 def get_query_request(self, response):
     # http://s.wanfangdata.com.cn/Patent.aspx?q=%E4%BD%A0%E5%A5%BD&f=c.Patent
     encoding = response.encoding
     query = response.meta["query"]
     query = query.encode(encoding)
     url = b"http://s.wanfangdata.com.cn/Patent.aspx?q=%s&f=c.Patent" % query
     request = Request(url)
     # noinspection PyUnresolvedReferences
     request.callback = self.query_callback
     return request
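The byte-string URL above is a Python 2 idiom; recent Scrapy versions require the request URL to be a text string. A sketch of an equivalent that percent-encodes the query explicitly (same names as the original):

 from urllib.parse import quote

 def get_query_request(self, response):
     query = response.meta["query"]
     encoded = quote(query.encode(response.encoding or "utf-8"))
     url = "http://s.wanfangdata.com.cn/Patent.aspx?q=%s&f=c.Patent" % encoded
     return Request(url, callback=self.query_callback)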
Example #5
    def next_request(self):
        '''
        Logic to handle getting a new url request
        '''
        t = time.time()
        item = self.find_item()
        if item:
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])
            # add callback: resolve the method name on the spider,
            # falling back to the default 'parse'
            if item.get("callback"):
                next_func = item['callback']
            else:
                next_func = 'parse'
            req.callback = getattr(self.spider, next_func)

            if "meta" in item:
                item = item.get('meta')

            # defaults
            if "attrs" not in item:
                item["attrs"] = {}
            if "allowed_domains" not in item:
                item["allowed_domains"] = ()
            if "allow_regex" not in item:
                item["allow_regex"] = ()
            if "deny_regex" not in item:
                item["deny_regex"] = ()
            if "deny_extensions" not in item:
                item["deny_extensions"] = None
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "maxdepth" not in item:
                item["maxdepth"] = 0
            if "priority" not in item:
                item['priority'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0
            if "expires" not in item:
                item['expires'] = 0

            # appid, crawlid and spiderid get no defaults above, so they are
            # assumed to always be present on the queue item
            for key in ('attrs', 'allowed_domains', 'curdepth', 'maxdepth',
                    'appid', 'crawlid', 'spiderid', 'priority', 'retry_times',
                    'expires', 'allow_regex', 'deny_regex', 'deny_extensions'):
                req.meta[key] = item[key]

            if item.get("pd"):
                req.meta['pd'] = item['pd']
            return req

        return None
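For reference, a queue item consumed by this method might look roughly like the following; the field names are taken from the code above, the values are invented, and when a 'meta' key is present the remaining fields are read from that nested dict instead:

 item = {
     'url': 'http://example.com/page',
     'callback': 'parse_page',   # name of a spider method, resolved via getattr
     'appid': 'demo',
     'crawlid': 'abc123',
     'spiderid': 'link',
     'priority': 0,
 }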
Example #6
    def process_item(self, item, spider):
        if item.__class__ == AppIdentificationItem:
            obj, created = AppIdentification.objects.get_or_create(
                apk_name=item['apk_name'])
            if 'top_type' in item and (item['top_type'] != obj.top_type):
                obj.top_type = item['top_type']
                obj.save()
            if 'category' in item:
                cat, is_created = Category.objects.get_or_create(
                    name=item['category'], top_type=item['top_type'])
            if created:
                appinfo = AppInfo(app_id=obj, data_source=item['data_source'])
                appinfo.save()
                log.msg('Get new apk %s' % obj.apk_name, level=log.INFO)
                return item
            else:
                spider.log('Duplicate apk %s' % obj.apk_name, level=log.INFO)
                return

        if item.__class__ == AppInfoItem:
            app = item['instance']
            # basic fields
            for key in APK_DETAILS_FILED_NAMES:
                setattr(app, key, item[key])
            app.is_crawled = 1
            app.last_crawl_time = datetime.now()
            app.save()
            # related info
            update_app_related(app, item)
            spider.log('update ok %s' % item['apk_name'], log.INFO)
            # sync data to Doraemon
            url = "%s/?apk_name=%s&force=%s" % (
                self.crawler.settings['DATA_SYNC_API'], app.app_id.apk_name,
                self.crawler.settings.get('FORCE_UPDATE'))
            # return a Deferred so the data is synced to Doraemon
            request = Request(url=url)
            request.callback = None
            request.errback = None
            dfd = self.crawler.engine.download(request, spider)
            dfd.addCallbacks(callback=self._sync_callback,
                             callbackArgs=(item['apk_name'], spider),
                             errback=self._sync_errback,
                             errbackArgs=(item['apk_name'], spider))
            dfd.addErrback(spider.log, level=log.ERROR)
            return dfd.addBoth(lambda _: item)
Example #8
 def _construct_full_text(self, dn, db):
     url = 'http://www.innojoy.com/client/interface.aspx'
     headers = {
         #'Accept': '*/*',
         'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
         'Referer': 'http://www.innojoy.com/SearchResult/default.shtml',
     }
     data = {"requestModule": "PatentSearch",
             "patentSearchConfig": {
                 "Query": "DN=%s" % dn,
                 "Database": db,
                 "PageSize": 1,
                 "RecordIndex": 0,
                 "Action": "loadFullText"
             }
     }
     data_bin = json.dumps(data)
     request = Request(url=url, method='post', headers=headers, body=data_bin)
     # noinspection PyUnresolvedReferences
     request.callback = self.parse_full_text
     return request
Example #9
 def get_query_request(self, response):
     query = response.meta['query']
     default_encoding = self._site_default_encoding
     encoding = default_encoding if default_encoding else response.encoding
     query = query.encode(encoding)
     _kw = urllib.quote(query, safe=b':=&()?')
     _query = urllib.quote(b" (%(query)s/AB+%(query)s/CL+%(query)s/TI+%(query)s/IN+%(query)s/PA)" % {'query': query}, safe=b'()')
     referer = b'http://searchtel.patentstar.com.cn/CPRS2010/cn/PatentGeneralList.html?No=999&kw=%s&Nm=23&errorTips=&Query=%s' % (_kw, _query)
     headers = {
         'Accept':'application/json, text/javascript, */*',
         'Content-Type': 'application/json; charset=UTF-8',
         'Referer': referer,
         # 'Cookie': get_cookie(),
         'X-Requested-With':'XMLHttpRequest',
     }
     data=b"{'PageSize':'10', 'PageNumber':'1','_strSearchNo':'999'}"
     url = 'http://searchtel.patentstar.com.cn/CPRS2010/cn/PatentGeneralList.aspx/GetXmlResult'
     # url = 'http://searchtel.patentstar.com.cn/CPRS2010/cn/PatentGeneralList.aspx/GetXmlResult'
     request = Request(url,method='POST',headers=headers,body=data)
     # noinspection PyUnresolvedReferences
     request.callback = self.query_callback
     return request
Example #10
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''
        t = time.time()
        # update the redis queues every so often
        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()
            self.expire_queues()

        # update the ip address every so often
        if t - self.update_ip_time > self.ip_update_interval:
            self.update_ip_time = t
            self.update_ipaddress()
            self.report_self()

        item = self.find_item()
        if item:
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])

            try:
                if 'callback' in item and item['callback'] is not None:
                    req.callback = getattr(self.spider, item['callback'])
            except AttributeError:
                self.logger.warn("Unable to find callback method")

            try:
                if 'errback' in item and item['errback'] is not None:
                    req.errback = getattr(self.spider, item['errback'])
            except AttributeError:
                self.logger.warn("Unable to find errback method")

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in list(item.keys()):
                req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])

            return req

        return None
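self.parse_cookie is referenced above but not shown; judging from the call site it turns a raw 'name=value; name2=value2' cookie string into the dict that Request.cookies expects. A possible sketch (the implementation is an assumption, not the project's actual helper):

    def parse_cookie(self, string):
        # split a raw Cookie header string into a dict for req.cookies
        result = {}
        for pair in string.split(';'):
            if '=' in pair:
                key, value = pair.split('=', 1)
                result[key.strip()] = value.strip()
        return result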
Example #11
    def next_request(self):
        '''
        Logic to handle getting a new url request, from a bunch of
        different queues
        '''

        t = time.time()
        # update the redis queues every so often

        if t - self.update_time > self.update_interval:
            self.update_time = t
            self.create_queues()

        item = self.find_item()

        if item:
            self.logger.info(
                'distributed_scheduler.py::DistributedScheduler::next_request call find_item() result is : %s'
                % (item["meta"]["url"] if 'meta' in item else item["url"]))
            self.logger.debug("Found url to crawl {url}" \
                    .format(url=item['url']))
            try:
                req = Request(item['url'])
            except ValueError:
                # need absolute url
                # need better url validation here
                req = Request('http://' + item['url'])

            if 'callback' in item:
                cb = item['callback']
                if cb and self.spider:
                    cb = get_method(self.spider, cb)
                    req.callback = cb

            if 'errback' in item:
                eb = item['errback']
                if eb and self.spider:
                    eb = get_method(self.spider, eb)
                    req.errback = eb

            if 'meta' in item:
                item = item['meta']

            # defaults not in schema
            if 'curdepth' not in item:
                item['curdepth'] = 0
            if "retry_times" not in item:
                item['retry_times'] = 0

            for key in item.keys():
                req.meta[key] = item[key]

            # extra check to add items to request
            if 'useragent' in item and item['useragent'] is not None:
                req.headers['User-Agent'] = item['useragent']
            if 'cookie' in item and item['cookie'] is not None:
                if isinstance(item['cookie'], dict):
                    req.cookies = item['cookie']
                elif isinstance(item['cookie'], basestring):
                    req.cookies = self.parse_cookie(item['cookie'])
            return req

        return None
Example #13
                    req.meta['_depth'] = depth + 1
                    req.meta['_callback'] = req.callback
                    req.callback = callback
                return requests

        # update request meta if any extra meta was passed through the --meta/-m opts.
        if opts.meta:
            request.meta.update(opts.meta)

        # update cb_kwargs if any extra values were passed through the --cbkwargs option.
        if opts.cbkwargs:
            request.cb_kwargs.update(opts.cbkwargs)

        request.meta['_depth'] = 1
        request.meta['_callback'] = request.callback
        request.callback = callback
        return request

    def process_options(self, args, opts):
        ScrapyCommand.process_options(self, args, opts)

        self.process_spider_arguments(opts)
        self.process_request_meta(opts)
        self.process_request_cb_kwargs(opts)

    def process_spider_arguments(self, opts):
        try:
            opts.spargs = arglist_to_dict(opts.spargs)
        except ValueError:
            raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
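The fragment above displaces the original callback and stashes it in request.meta['_callback']; the generic callback it installs can later recover and invoke it. A sketch of that pattern (not the exact code this fragment belongs to):

 def callback(response, **cb_kwargs):
     # recover the spider callback that was displaced when the request was built
     original = response.meta.get('_callback')
     depth = response.meta.get('_depth', 1)
     results = original(response, **cb_kwargs) if original else []
     # ... inspect the yielded items/requests here, recursing while depth
     #     stays below the configured limit ...
     return results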