def _get_request_gsid(self, request):
    url_gsid = get_url_query(request.url).get('gsid', '')
    cookies_gsid = ''
    meta_gsid = request.meta.get('gsid', '')
    referer = request.headers.get('referer', '')
    referer_gsid = get_url_query(referer).get('gsid', '')
    return url_gsid, cookies_gsid, meta_gsid, referer_gsid
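# get_url_query and change_url_query are project helpers used throughout this code.
# A minimal sketch of the assumed behaviour, based on how they are called here
# (query string in/out as a flat dict of single values); the real implementations
# in the project may differ.
import urllib
import urlparse


def get_url_query(url):
    """Return the query string of ``url`` as a plain dict (last value wins)."""
    return dict(urlparse.parse_qsl(urlparse.urlparse(url).query))


def change_url_query(url, query_dic):
    """Return ``url`` with its query string rebuilt from ``query_dic``."""
    parts = list(urlparse.urlparse(url))
    parts[4] = urllib.urlencode(query_dic)  # index 4 is the query component
    return urlparse.urlunparse(parts)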
def get_query_request(self, response):
    intime = self.intime
    # '全部时间' means "all time": no time filter, so the default behaviour applies.
    if intime == '全部时间':
        return super(SogouBbsSpider, self).get_query_request(response)
    # noinspection PyPropertyAccess
    br = self.br
    mechanize_response = response_scrapy2mechanize(response)
    br.set_response(mechanize_response)
    br.select_form(nr=self.search_form_order)
    query = response.meta['query']
    encoding = response.encoding
    query = query.encode(encoding)
    search_input_name = self.search_input_name.encode(encoding)
    br[search_input_name] = query
    br.submit()
    # On the result page, click the link whose text is the wanted time range.
    intime = intime.encode('utf8')
    query_request = br.click_link(text=intime)
    scrapy_request = request_mechanize2scrapy(query_request)
    scrapy_request.callback = self.query_callback
    # Ask for 100 results per page.
    url = scrapy_request.url
    query = get_url_query(url)
    query['num'] = 100
    new_url = change_url_query(url, query)
    new_request = scrapy_request.replace(url=new_url)
    return new_request
def get_next_page_request(self, response):
    br = self.br
    mechanize_response = response_scrapy2mechanize(response)
    br.set_response(mechanize_response)
    encoding = response.encoding
    next_page_word = self.next_page_word.encode(encoding)
    next_page_link = self.get_next_page_link()
    try:
        if next_page_link:
            next_page_request = br.click_link(link=next_page_link)
        else:
            next_page_request = br.click_link(text=next_page_word)
        if next_page_request:
            # Keep the current URL and just bump the ``page`` parameter.
            url = response.url
            query = get_url_query(url)
            page = str(int(query.get('page', '1')) + 1)
            query['page'] = page
            url = change_url_query(url, query)
            scrapy_request = Request(url=url, callback=self.query_callback)
            return scrapy_request
        else:
            return None
    except LinkNotFoundError:
        # No "next page" link: we are on the last page.
        return None
    except Exception as e:
        self.log('spider turn page error: %s' % e, level=log.INFO)
        return None
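# response_scrapy2mechanize and request_mechanize2scrapy are project-level bridges
# between Scrapy and mechanize objects. A rough sketch of what they are assumed to
# do, so the form/link handling above can be followed; the real helpers may carry
# more state (cookies, encoding handling, ...).
import mechanize
from scrapy.http import Request


def response_scrapy2mechanize(response):
    """Wrap a Scrapy response so mechanize's Browser can parse its forms and links."""
    headers = [(name, ', '.join(values)) for name, values in response.headers.items()]
    return mechanize.make_response(response.body, headers, response.url,
                                   response.status, 'OK')


def request_mechanize2scrapy(mechanize_request):
    """Turn the request returned by Browser.click_link/submit into a Scrapy Request."""
    body = mechanize_request.get_data() if mechanize_request.has_data() else ''
    return Request(url=mechanize_request.get_full_url(),
                   method=mechanize_request.get_method(),
                   headers=dict(mechanize_request.header_items()),
                   body=body)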
def process_request(self, request, spider):
    """Cookies are not used at the moment."""
    url_gsid, cookies_gsid, meta_gsid, referer_gsid = self._get_request_gsid(request)
    if url_gsid or cookies_gsid:
        # The request already carries a gsid; leave it alone.
        return
    elif meta_gsid:
        gsid = meta_gsid
    elif referer_gsid:
        gsid = referer_gsid
    else:
        gsid = self._choose_gsid()
    if not gsid:
        log.msg('CaibanSinaWeiboMiddleware, no gsid')
        return
    self._minus_gsid_times(gsid)
    raw_url = request.url
    query_dic = get_url_query(raw_url)
    query_dic['gsid'] = gsid
    new_url = change_url_query(raw_url, query_dic)
    new_meta = request.meta
    new_meta['gsid'] = gsid
    new_request = request.replace(url=new_url, meta=new_meta)
    return new_request
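# For the middleware above to run it has to be registered in the project's Scrapy
# settings. A sketch, assuming a hypothetical module path
# 'myproject.middlewares.CaibanSinaWeiboMiddleware'; the real path depends on where
# the class actually lives.
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CaibanSinaWeiboMiddleware': 543,
}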
def parse_list_page(self, response):
    multi_xpath = '//div[@class="special-area-cont"]/div'
    html5_response = response_html5parse(response)
    hxs = HtmlXPathSelector(html5_response)
    multi_hxs = hxs.select(multi_xpath)
    query = response.meta.get('query')
    for hxs in multi_hxs:
        # //li[@class="g"][1]//h3/a/@href
        title = ''.join(hxs.select('.//h3//text()').extract()).strip()
        _video_name = ''.join(hxs.select('.//h3//span//font//text()').extract())
        if _video_name != query:
            continue
        url = ''.join(hxs.select('.//h3//a/@href').extract())
        _id = get_url_query(url)['id']
        doc = {
            'data_source': 'baidu视频搜索',
            'url': url,
            'title': title,
            'id': _id,
        }
        list_url = response.url
        json_list_url = 'http://video.baidu.com/htvshowsingles/?id=%s' % _id
        next_request = Request(json_list_url, callback=self.parse_site_list)
        item = VideoZjcmItem(doc=doc, next_request=next_request, list_url=list_url,
                             query=query, attachments=[], attachment_urls=[])
        yield self.item_or_request(item)
def get_query_request(self, response):
    request = super(GooglePatentSpider, self).get_query_request(response)
    url = request.url
    query = get_url_query(url)
    # Drop the form button parameter and ask for 100 results per page.
    query.pop('btnG')
    query['num'] = '100'
    new_url = change_url_query(url, query)
    new_request = request.replace(url=new_url)
    return new_request
def get_next_page_request(self, response):
    request = super(jiuzhengjiancaiCompanySpider, self).get_next_page_request(response)
    url = request.url
    query = get_url_query(url)
    p = query.get('p', 1)
    p = int(p)
    query['p'] = str(p + 1)
    url = change_url_query(url, query)
    request = request.replace(url=url)
    return request
def get_entry_request(self, query):
    url_query = get_url_query(self.entry_url)
    url_query['q'] = query.encode(self._site_default_encoding)
    new_url = change_url_query(self.entry_url, url_query)
    headers = {
        'referer': 'http://china.makepolo.com/',
    }
    meta = {
        'query': query,
        'page_num': 0,
    }
    request = Request(url=new_url, headers=headers, callback=self.query_callback, meta=meta)
    return request
def get_query_request(self, response):
    """Fill in the search form and build the corresponding request."""
    request = super(AlibabaCompanySpider, self).get_query_request(response)
    encoding = response.encoding
    url = request.url
    query = get_url_query(url)
    query['province'] = self.province.encode(encoding)
    query['city'] = self.city.encode(encoding)
    query['filt'] = b'y'
    query.pop('button_click', None)
    new_url = change_url_query(url, query)
    request = request.replace(url=new_url)
    return request
def caiban_sina_weibo_login(user, pw):
    encoding = 'utf8'
    br = get_br()
    try:
        br.open('http://weibo.cn/')
        br.follow_link(text='登录'.encode(encoding))  # follow the "log in" link
    except Exception as e:
        return '', str(e)
    form = list(br.forms())[0]
    user_control = form.controls[0]
    pw_control = form.controls[1]
    # remember_control = form.controls[2]
    user_control.value = user.encode(encoding)
    pw_control.value = pw.encode(encoding)
    # default is on
    # remember_control.value = ['on', ]
    try:
        br.open(form.click())
    except Exception as e:
        return '', str(e)
    url = br.geturl()
    gsid = get_url_query(url).get('gsid', '')
    # url = 'http://weibo.cn/?gsid=%(gsid)s&vt=4' % {'gsid': gsid}
    # br.open(url)
    content = br.response().read().decode(encoding, 'ignore')
    if content.find('请输入图片中的字符') != -1:  # "enter the characters in the image"
        reason = 'yzm'  # a captcha is required
    elif content.find('您的微博帐号出现异常被暂时冻结') != -1:  # account temporarily frozen
        reason = 'freeze'
    elif content.find('@我的') != -1:  # the "@me" menu entry: login succeeded
        reason = 'success'
    elif content.find('登录名或密码错误') != -1:  # wrong username or password
        reason = 'auth fail'
    elif url.find('http://login.weibo.cn/login') != -1:
        reason = 'fail'
    elif url.find('http://weibo.cn/pub/') != -1:
        reason = 'redirect'
    else:
        reason = 'unknown'
    return gsid, reason
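# get_br() is expected to return a mechanize.Browser prepared for crawling. A minimal
# sketch of such a factory, assuming the usual setup (robots.txt handling off, a
# browser-like User-Agent); the project's real get_br may configure more (proxy,
# cookie jar, retries, ...).
import mechanize


def get_br():
    br = mechanize.Browser()
    br.set_handle_robots(False)   # do not fetch or obey robots.txt
    br.set_handle_redirect(True)  # follow HTTP redirects
    br.addheaders = [('User-Agent', 'Mozilla/5.0 (compatible; crawler)')]
    return br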
def parse_list_page(self, response):
    """
    shop name: crawled; nature: computed; source: computed; category: crawled?;
    shop owner: crawled; address: crawled; contact info: crawled;
    shop creation date: crawled; main products: crawled; status: ?;
    operation: not needed

    Earlier hard-coded version, kept for reference:

    shop_info = {}
    shop_info['shop_type_id'] = 10
    shop_info['shop_name'] = self.company
    shop_info['shop_address'] = self.address
    if not self.address:
        shop_info['shop_address'] = '山东淄博'
    shop_info['shop_contacts'] = self.contact
    shop_info['shop_phone'] = self.phone
    shop_info['shop_products'] = self.keywords
    shop_info['shop_site_url'] = self.site_url
    shop_info['shop_site_url_hash'] = fnvhash.fnv_64a_str(self.site_url)
    shop_info['shop_site_type'] = 24
    shop_info['shop_certified'] = 1
    shop_info['shop_owner_type'] = 1
    company_key = ['厂', '站', '公司', '事务所', '集团']
    for item in company_key:
        if item in self.company:
            shop_info['shop_owner_type'] = 2
    """
    multi_xpath = '//*[@id="sw_mod_searchlist"]/ul/li'
    html5_response = response_html5parse(response)
    page_hxs = HtmlXPathSelector(html5_response)
    multi_hxs = page_hxs.select(multi_xpath)
    for hxs in multi_hxs:
        # shop_products holds every shop-related description: main products,
        # the brief description and the detail description.
        # extracted fields
        shop_name = ''.join(hxs.select('.//a[@class="sw-ui-font-title14"]//text()').extract())
        shop_address = ''.join(hxs.select('.//div[@class="sm-offerResult-address"]/a/text()').extract())
        # Main products, e.g. "有机化工原料; 苯 醇 酯 醚 类 批发".
        # Only part of the main products shows up here; the full list is in the
        # title of the "main products" link.
        shop_part_products = ''.join(hxs.select('.//div[@class="sm-offerResult-sale"]//text()').extract())
        shop_brief = ''.join(hxs.select('.//div[@class="sm-offerResult-sub"]//text()').extract())
        creditdetail_url = ''.join(hxs.select('.//a[@class="sw-ui-font-title14"]/@href').extract())
        creditdetail_url = urllib.unquote(creditdetail_url).strip()
        # computed fields
        shop_products = shop_brief + shop_part_products
        creditdetail_url_query = get_url_query(creditdetail_url)
        creditdetail_url_query.pop('tracelog', None)
        creditdetail_url = change_url_query(creditdetail_url, creditdetail_url_query)
        shop_site_url = get_site_url(creditdetail_url)
        shop_owner_type = calc_shop_owner_type(shop_name)
        shop_site_url_hash = fnvhash.fnv_64a_str(shop_site_url)
        # fields with no matching data on the page
        shop_qq = None
        shop_email = None
        lack_doc = {
            'shop_qq': shop_qq,
            'shop_email': shop_email,
        }
        # default-filled fields
        shop_type_id = None
        shop_area_id = None
        shop_site_type = self.shop_site_type
        shop_certified = 1
        city_id = 1
        is_bad_url = 0
        is_bad_time = None
        deleted = 0
        isRead = 0
        isImport = 0
        default_doc = {
            'shop_type_id': shop_type_id,
            'shop_area_id': shop_area_id,
            'shop_site_type': shop_site_type,
            'shop_certified': shop_certified,
            'city_id': city_id,
            'is_bad_url': is_bad_url,
            'is_bad_time': is_bad_time,
            'deleted': deleted,
            'isRead': isRead,
            'isImport': isImport,
        }
        now = datetime.datetime.utcnow()
        doc = {
            'shop_name': shop_name,
            'shop_address': shop_address,
            'shop_products': shop_products,
            'shop_site_url': shop_site_url,
            'shop_site_url_hash': shop_site_url_hash,
            'shop_owner_type': shop_owner_type,
            'crawl_time': now,
        }
        doc.update(lack_doc)
        doc.update(default_doc)
        detail_url = creditdetail_url
        list_url = response.url
        query = response.meta.get('query')
        item = LegItem(collection=self.collection, doc=doc, detail_url=detail_url,
                       list_url=list_url, query=query)
        if detail_url and self.visit_detail:
            detail_request = Request(detail_url, callback=self.parse_detail_page)
            detail_request.meta['item'] = item
            detail_request.meta['query'] = query
            yield detail_request
        else:
            yield item
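# calc_shop_owner_type is a project helper. The commented-out code in the docstring
# above suggests the rule it implements: owner type 2 (company-like) when the shop
# name contains a company-style keyword, otherwise 1 (individual). A sketch of that
# assumed rule; the real helper may use a longer keyword list.
def calc_shop_owner_type(shop_name):
    company_keys = ['厂', '站', '公司', '事务所', '集团']
    for key in company_keys:
        if key in shop_name:
            return 2
    return 1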
def _get_user_id_from_url(self, url):
    return get_url_query(url).get('uid', '')