def _construct_query(self, page_num, query):
    url = 'http://www.innojoy.com/client/interface.aspx'
    data = {
        "requestModule": "PatentSearch",
        "userId": "",
        "patentSearchConfig": {
            "Query": query,
            "TreeQuery": "",
            "Database": "idpat,mypat,phpat,sgpat,itpat,inpat,inapp,chpat,frpat,gbpat,depat,"
                        "jpapp,eppat,wopat,usapp,usdes,uspp,usre,uspat,fmsq,wgzl,syxx,fmzl",
            "Action": "Search",
            "Page": str(page_num),
            "PageSize": self._page_size,
            "GUID": "",
            "Sortby": "",
            "AddOnes": "",
            "DelOnes": "",
            "RemoveOnes": "",
            "TrsField": "",
            "SmartSearch": ""
        }
    }
    data_bin = json.dumps(data)
    headers = {
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'http://www.innojoy.com/SearchResult/default.shtml',
    }
    request = Request(url=url, method='POST', headers=headers, body=data_bin)
    # noinspection PyUnresolvedReferences
    request.callback = self.query_callback
    return request
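# Hedged usage sketch (assumption, not from the original source): how a spider
# might kick off the search with the helper above. The query expression
# 'TI=(solar cell)' is purely illustrative; query_callback would then parse the
# JSON result and call _construct_query(page_num + 1, query) to page forward.
def start_requests(self):
    yield self._construct_query(1, u'TI=(solar cell)')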
def get_next_page_request(self, response):
    sleep_time = self.crawler.settings.get('DOWNLOAD_DELAY', 1)
    time.sleep(sleep_time)
    request_data = response.request.body
    data = json.loads(request_data)
    page_number = data.get('PageNumber', None)
    if not page_number:
        return
    page_number = int(page_number) + 1
    data['PageNumber'] = page_number
    referer = response.request.headers['Referer']
    url = 'http://searchtel.patentstar.com.cn/CPRS2010/cn/PatentGeneralList.aspx/GetXmlResult'
    headers = {
        'Content-Type': 'application/json; charset=UTF-8',
        'Referer': referer,
        'Cookie': get_cookie(),
    }
    _response = requests.post(url, data=json.dumps(data), headers=headers)
    try:
        result = _response.json()
        if result['d'][0] is not None:
            immediate_response = response_requests2scrapy(_response)
            meta = {'immediate_response': immediate_response}
            request = Request(url, headers=headers, method='POST', meta=meta)
            # noinspection PyUnresolvedReferences
            request.callback = self.query_callback
            return request
    except Exception as e:
        log.msg('spider turn page error: %s' % str(e), level=log.INFO)
    return None
def start_requests():
    for url in start_urls:
        req = Request(url, dont_filter=True) if isinstance(url, basestring) else url
        if callback is not None:
            req.callback = callback
        yield req
def get_query_request(self, response):
    # http://s.wanfangdata.com.cn/Patent.aspx?q=%E4%BD%A0%E5%A5%BD&f=c.Patent
    encoding = response.encoding
    query = response.meta["query"]
    query = query.encode(encoding)
    url = b"http://s.wanfangdata.com.cn/Patent.aspx?q=%s&f=c.Patent" % query
    request = Request(url)
    # noinspection PyUnresolvedReferences
    request.callback = self.query_callback
    return request
def next_request(self):
    '''
    Logic to handle getting a new url request
    '''
    t = time.time()
    item = self.find_item()
    if item:
        try:
            req = Request(item['url'])
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request('http://' + item['url'])

        # add callback
        if item.get("callback"):
            next_func = item['callback']
        else:
            next_func = 'parse'
        req.callback = getattr(self.spider, next_func)

        if "meta" in item:
            item = item.get('meta')

        # defaults
        if "attrs" not in item:
            item["attrs"] = {}
        if "allowed_domains" not in item:
            item["allowed_domains"] = ()
        if "allow_regex" not in item:
            item["allow_regex"] = ()
        if "deny_regex" not in item:
            item["deny_regex"] = ()
        if "deny_extensions" not in item:
            item["deny_extensions"] = None
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if "maxdepth" not in item:
            item["maxdepth"] = 0
        if "priority" not in item:
            item['priority'] = 0
        if "retry_times" not in item:
            item['retry_times'] = 0
        if "expires" not in item:
            item['expires'] = 0

        for key in ('attrs', 'allowed_domains', 'curdepth', 'maxdepth',
                    'appid', 'crawlid', 'spiderid', 'priority',
                    'retry_times', 'expires', 'allow_regex', 'deny_regex',
                    'deny_extensions'):
            req.meta[key] = item[key]

        if item.get("pd"):
            req.meta['pd'] = item['pd']

        return req

    return None
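# Hedged example (assumption, not from the source): the shape of a queue item
# that next_request() above can consume. 'callback' must name a method on the
# spider; appid/crawlid/spiderid have no defaults, so they must be present in
# the item (or in its 'meta' dict, which replaces the item before the copy loop).
example_item = {
    'url': 'http://example.com/start',
    'callback': 'parse',
    'meta': {
        'appid': 'demo-app',
        'crawlid': 'abc123',
        'spiderid': 'link',
        'maxdepth': 2,
        'priority': 10,
    },
}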
def process_item(self, item, spider):
    if item.__class__ == AppIdentificationItem:
        obj, created = AppIdentification.objects.get_or_create(
            apk_name=item['apk_name'])
        if 'top_type' in item and (item['top_type'] != obj.top_type):
            obj.top_type = item['top_type']
            obj.save()
        if 'category' in item:
            cat, is_created = Category.objects.get_or_create(
                name=item['category'], top_type=item['top_type'])
        if created:
            appinfo = AppInfo(app_id=obj, data_source=item['data_source'])
            appinfo.save()
            log.msg('Get new apk %s' % obj.apk_name, level=log.INFO)
            return item
        else:
            spider.log('Duplicate apk %s' % obj.apk_name, level=log.INFO)
            return

    if item.__class__ == AppInfoItem:
        app = item['instance']
        # basic info
        for key in APK_DETAILS_FILED_NAMES:
            setattr(app, key, item[key])
        app.is_crawled = 1
        app.last_crawl_time = datetime.now()
        app.save()
        # related info
        update_app_related(app, item)
        spider.log('update ok %s' % item['apk_name'], log.INFO)
        # sync data to Doraemon
        url = "%s/?apk_name=%s&force=%s" % (
            self.crawler.settings['DATA_SYNC_API'],
            app.app_id.apk_name,
            self.crawler.settings.get('FORCE_UPDATE'))
        # return a Deferred so the item is synced to Doraemon before it is yielded
        request = Request(url=url)
        request.callback = None
        request.errback = None
        dfd = self.crawler.engine.download(request, spider)
        dfd.addCallbacks(callback=self._sync_callback,
                         callbackArgs=(item['apk_name'], spider),
                         errback=self._sync_errback,
                         errbackArgs=(item['apk_name'], spider))
        dfd.addErrback(spider.log, level=log.ERROR)
        return dfd.addBoth(lambda _: item)
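# Hedged sketch (assumption, not part of the original pipeline): Twisted passes
# the download result first and then the extra callbackArgs/errbackArgs, so the
# handlers wired onto the Deferred above could look roughly like this.
def _sync_callback(self, response, apk_name, spider):
    spider.log('synced %s to Doraemon (HTTP %s)' % (apk_name, response.status), log.INFO)

def _sync_errback(self, failure, apk_name, spider):
    spider.log('sync %s to Doraemon failed: %s' % (apk_name, failure.getErrorMessage()), log.ERROR)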
def _construct_full_text(self, dn, db):
    url = 'http://www.innojoy.com/client/interface.aspx'
    headers = {
        # 'Accept': '*/*',
        'Content-Type': 'application/x-www-form-urlencoded; charset=UTF-8',
        'Referer': 'http://www.innojoy.com/SearchResult/default.shtml',
    }
    data = {
        "requestModule": "PatentSearch",
        "patentSearchConfig": {
            "Query": "DN=%s" % dn,
            "Database": db,
            "PageSize": 1,
            "RecordIndex": 0,
            "Action": "loadFullText"
        }
    }
    data_bin = json.dumps(data)
    request = Request(url=url, method='POST', headers=headers, body=data_bin)
    # noinspection PyUnresolvedReferences
    request.callback = self.parse_full_text
    return request
def get_query_request(self, response):
    query = response.meta['query']
    default_encoding = self._site_default_encoding
    encoding = default_encoding if default_encoding else response.encoding
    query = query.encode(encoding)
    _kw = urllib.quote(query, safe=b':=&()?')
    _query = urllib.quote(
        b" (%(query)s/AB+%(query)s/CL+%(query)s/TI+%(query)s/IN+%(query)s/PA)" % {'query': query},
        safe=b'()')
    referer = (b'http://searchtel.patentstar.com.cn/CPRS2010/cn/PatentGeneralList.html'
               b'?No=999&kw=%s&Nm=23&errorTips=&Query=%s' % (_kw, _query))
    headers = {
        'Accept': 'application/json, text/javascript, */*',
        'Content-Type': 'application/json; charset=UTF-8',
        'Referer': referer,
        # 'Cookie': get_cookie(),
        'X-Requested-With': 'XMLHttpRequest',
    }
    data = b"{'PageSize':'10', 'PageNumber':'1','_strSearchNo':'999'}"
    url = 'http://searchtel.patentstar.com.cn/CPRS2010/cn/PatentGeneralList.aspx/GetXmlResult'
    request = Request(url, method='POST', headers=headers, body=data)
    # noinspection PyUnresolvedReferences
    request.callback = self.query_callback
    return request
def next_request(self):
    '''
    Logic to handle getting a new url request, from a bunch of
    different queues
    '''
    t = time.time()
    # update the redis queues every so often
    if t - self.update_time > self.update_interval:
        self.update_time = t
        self.create_queues()
        self.expire_queues()

    # update the ip address every so often
    if t - self.update_ip_time > self.ip_update_interval:
        self.update_ip_time = t
        self.update_ipaddress()
        self.report_self()

    item = self.find_item()
    if item:
        self.logger.debug("Found url to crawl {url}" \
                          .format(url=item['url']))
        try:
            req = Request(item['url'])
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request('http://' + item['url'])

        try:
            if 'callback' in item and item['callback'] is not None:
                req.callback = getattr(self.spider, item['callback'])
        except AttributeError:
            self.logger.warn("Unable to find callback method")

        try:
            if 'errback' in item and item['errback'] is not None:
                req.errback = getattr(self.spider, item['errback'])
        except AttributeError:
            self.logger.warn("Unable to find errback method")

        if 'meta' in item:
            item = item['meta']

        # defaults not in schema
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if "retry_times" not in item:
            item['retry_times'] = 0

        for key in list(item.keys()):
            req.meta[key] = item[key]

        # extra check to add items to request
        if 'useragent' in item and item['useragent'] is not None:
            req.headers['User-Agent'] = item['useragent']
        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            elif isinstance(item['cookie'], basestring):
                req.cookies = self.parse_cookie(item['cookie'])

        return req

    return None
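# Hedged example (assumption, not from the source): a queue item for the
# scheduler variant above, showing the optional per-request overrides it
# honours. Note that when a 'meta' dict is present, item is replaced by it
# before the useragent/cookie checks, so those overrides would then need to
# live inside 'meta' rather than at the top level.
example_item = {
    'url': 'example.com/start',   # scheme-less urls get 'http://' prepended
    'callback': 'parse',
    'errback': 'handle_error',
    'useragent': 'Mozilla/5.0 (compatible; demo-bot)',
    'cookie': 'sessionid=abc123; lang=en',  # string cookies go through parse_cookie()
}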
def next_request(self):
    '''
    Logic to handle getting a new url request, from a bunch of
    different queues
    '''
    t = time.time()
    # update the redis queues every so often
    if t - self.update_time > self.update_interval:
        self.update_time = t
        self.create_queues()

    item = self.find_item()
    if item:
        self.logger.info(
            'distributed_scheduler.py::DistributedScheduler::next_request call find_item() result is : %s'
            % (item["meta"]["url"] if 'meta' in item else item["url"]))
        self.logger.debug("Found url to crawl {url}" \
                          .format(url=item['url']))
        try:
            req = Request(item['url'])
        except ValueError:
            # need absolute url
            # need better url validation here
            req = Request('http://' + item['url'])

        if 'callback' in item:
            cb = item['callback']
            if cb and self.spider:
                cb = get_method(self.spider, cb)
                req.callback = cb

        if 'errback' in item:
            eb = item['errback']
            if eb and self.spider:
                eb = get_method(self.spider, eb)
                req.errback = eb

        if 'meta' in item:
            item = item['meta']

        # defaults not in schema
        if 'curdepth' not in item:
            item['curdepth'] = 0
        if "retry_times" not in item:
            item['retry_times'] = 0

        for key in item.keys():
            req.meta[key] = item[key]

        # extra check to add items to request
        if 'useragent' in item and item['useragent'] is not None:
            req.headers['User-Agent'] = item['useragent']
        if 'cookie' in item and item['cookie'] is not None:
            if isinstance(item['cookie'], dict):
                req.cookies = item['cookie']
            elif isinstance(item['cookie'], basestring):
                req.cookies = self.parse_cookie(item['cookie'])

        return req

    return None
        req.meta['_depth'] = depth + 1
        req.meta['_callback'] = req.callback
        req.callback = callback
        return requests

    # update request meta if any extra meta was passed through the --meta/-m opts.
    if opts.meta:
        request.meta.update(opts.meta)

    # update cb_kwargs if any extra values were passed through the --cbkwargs option.
    if opts.cbkwargs:
        request.cb_kwargs.update(opts.cbkwargs)

    request.meta['_depth'] = 1
    request.meta['_callback'] = request.callback
    request.callback = callback
    return request

def process_options(self, args, opts):
    ScrapyCommand.process_options(self, args, opts)
    self.process_spider_arguments(opts)
    self.process_request_meta(opts)
    self.process_request_cb_kwargs(opts)

def process_spider_arguments(self, opts):
    try:
        opts.spargs = arglist_to_dict(opts.spargs)
    except ValueError:
        raise UsageError("Invalid -a value, use -a NAME=VALUE", print_help=False)
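# Hedged usage sketch (assumption about the exact CLI surface, not taken from
# the fragment above): the opts handled here correspond to `scrapy parse`
# options, so an invocation exercising them could look like
#
#   scrapy parse --spider=myspider -c parse_item -d 2 \
#       -a category=books \
#       --meta '{"handle_httpstatus_list": [404]}' \
#       --cbkwargs '{"section": "news"}' \
#       'http://example.com/page'
#
# opts.meta and opts.cbkwargs arrive as JSON strings and are merged into
# request.meta and request.cb_kwargs before the wrapping callback is attached.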