Code example #1
File: utils.py Project: hackrole/scrapy-utils
 def _get_request_gsid(self, request):
     url_gsid = get_url_query(request.url).get('gsid', '')
     cookies_gsid = ''
     meta_gsid = request.meta.get('gsid', '')
     referer = request.headers.get('referer', '')
     referer_gsid = get_url_query(referer).get('gsid', '')
     return url_gsid, cookies_gsid, meta_gsid, referer_gsid
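
Every example in this listing goes through the get_url_query and change_url_query helpers from scrapy-utils, whose source is not shown here. Judging only from the call sites, they parse a URL's query string into a flat dict and rebuild the URL from a modified dict; a minimal sketch under that assumption (Python 2, matching the examples) might look like this:

# Minimal sketch of the URL-query helpers used throughout these examples.
# This is an assumption inferred from the call sites, not the actual
# scrapy-utils implementation.
from urllib import urlencode        # Python 2, matching the examples
from urlparse import urlparse, urlunparse, parse_qs


def get_url_query(url):
    """Return the query string of `url` as a flat dict (last value wins)."""
    parsed = parse_qs(urlparse(url).query)
    return dict((key, values[-1]) for key, values in parsed.items())


def change_url_query(url, query_dic):
    """Return `url` with its query string replaced by `query_dic`."""
    parts = list(urlparse(url))
    parts[4] = urlencode(query_dic)  # position 4 is the query component
    return urlunparse(parts)

With helpers of this shape, a pattern like query['num'] = '100' followed by change_url_query(url, query) simply swaps that one parameter and leaves the rest of the URL alone.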
Code example #2
File: sogou_bbs.py Project: hackrole/scrapy-utils
 def get_query_request(self, response):
     intime = self.intime
     if intime == '全部时间':  # '全部时间' = "all time"
         return super(SogouBbsSpider, self).get_query_request(response)
     # noinspection PyPropertyAccess
     br = self.br
     mechanize_response = response_scrapy2mechanize(response)
     br.set_response(mechanize_response)
     br.select_form(nr=self.search_form_order)
     query = response.meta['query']
     encoding = response.encoding
     query = query.encode(encoding)
     search_input_name = self.search_input_name.encode(encoding)
     br[search_input_name] = query
     br.submit()
     intime = intime.encode('utf8')
     query_request = br.click_link(text=intime)
     scrapy_request = request_mechanize2scrapy(query_request)
     scrapy_request.callback = self.query_callback
     url = scrapy_request.url
     query = get_url_query(url)
     query['num'] = 100
     new_url = change_url_query(url, query)
     new_request = scrapy_request.replace(url=new_url)
     return new_request
Code example #3
 def get_next_page_request(self, response):
     br = self.br
     mechanize_response = response_scrapy2mechanize(response)
     br.set_response(mechanize_response)
     encoding = response.encoding
     next_page_word = self.next_page_word.encode(encoding)
     next_page_link = self.get_next_page_link()
     try:
         if next_page_link:
             next_page_request = br.click_link(link=next_page_link)
         else:
             next_page_request = br.click_link(text=next_page_word)
         if next_page_request:
             url = response.url
             query = get_url_query(url)
             page = str(int(query.get('page', '1')) + 1)
             query['page'] = page
             url = change_url_query(url, query)
             scrapy_request = Request(url=url, callback=self.query_callback)
             return scrapy_request
         else:
             return None
     except LinkNotFoundError:
         return None
     except Exception as e:
         self.log('spider turn page error:%s' % e, level=log.INFO)
         return None
Code example #4
File: utils.py Project: hackrole/scrapy-utils
 def process_request(self, request, spider):
     """
     当前未使用cookies
     """
     url_gsid, cookies_gsid, meta_gsid, referer_gsid = self._get_request_gsid(request)
     if url_gsid or cookies_gsid:
         return
     elif meta_gsid:
         gsid = meta_gsid
     elif referer_gsid:
         gsid = referer_gsid
     else:
         gsid = self._choose_gsid()
     if not gsid:
         log.msg('CaibanSinaWeiboMiddleware: no gsid available')
         return
     self._minus_gsid_times(gsid)
     raw_url = request.url
     query_dic = get_url_query(raw_url)
     query_dic['gsid'] = gsid
     new_url = change_url_query(raw_url, query_dic)
     new_meta = request.meta
     new_meta['gsid'] = gsid
     new_request = request.replace(url=new_url, meta=new_meta)
     return new_request
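
Code example #4 is written as a Scrapy downloader middleware: when process_request returns a new Request, Scrapy drops the original request and schedules the rewritten one (here, with the gsid injected into the URL and meta). For a middleware like this to run it has to be registered in the project settings; a hypothetical entry (the module path and priority are illustrative, not taken from the source) would be:

# settings.py -- module path and priority are illustrative placeholders
DOWNLOADER_MIDDLEWARES = {
    'myproject.middlewares.CaibanSinaWeiboMiddleware': 543,
}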
Code example #5
File: baidu_video.py Project: hackrole/scrapy-utils
    def parse_list_page(self, response):

        multi_xpath = '//div[@class="special-area-cont"]/div'
        html5_response = response_html5parse(response)
        hxs = HtmlXPathSelector(html5_response)
        multi_hxs = hxs.select(multi_xpath)
        query = response.meta.get('query')
        for hxs in multi_hxs:
            #//li[@class="g"][1]//h3/a/@href
            title = ''.join(hxs.select('.//h3//text()').extract()).strip()
            _video_name = ''.join(hxs.select('.//h3//span//font//text()').extract())
            if _video_name != query:
                continue
            url = ''.join(hxs.select('.//h3//a/@href').extract())
            _id = get_url_query(url)['id']
            doc = {
                'data_source': 'baidu视频搜索',  # "Baidu video search"
                'url': url,
                'title': title,
                'id': _id,
            }
            list_url = response.url
            json_list_url = 'http://video.baidu.com/htvshowsingles/?id=%s' % _id
            next_request = Request(json_list_url, callback=self.parse_site_list)
            item = VideoZjcmItem(doc=doc,
                                 next_request=next_request, list_url=list_url, query=query,
                                 attachments=[], attachment_urls=[])
            yield self.item_or_request(item)
Code example #6
 def get_query_request(self, response):
     request = super(GooglePatentSpider, self).get_query_request(response)
     url = request.url
     query = get_url_query(url)
     query.pop('btnG', None)  # drop the search-button parameter if present
     query['num'] = '100'
     new_url = change_url_query(url, query)
     new_request = request.replace(url=new_url)
     return new_request
Code example #7
 def get_next_page_request(self, response):
     request = super(jiuzhengjiancaiCompanySpider, self).get_next_page_request(response)
     url = request.url
     query = get_url_query(url)
     p = query.get('p', 1)
     p = int(p)
     query['p'] = str(p + 1)
     url = change_url_query(url, query)
     request = request.replace(url=url)
     return request
Code example #8
 def get_entry_request(self, query):
     url_query = get_url_query(self.entry_url)
     url_query['q'] = query.encode(self._site_default_encoding)
     new_url = change_url_query(self.entry_url, url_query)
     headers = {
         'referer': 'http://china.makepolo.com/',
     }
     meta = {
         'query': query,
         'page_num': 0,
     }
     request = Request(url=new_url, headers=headers, callback=self.query_callback, meta=meta)
     return request
Code example #9
 def get_query_request(self, response):
     """
     填表单,构造相应请求
     """
     request = super(AlibabaCompanySpider, self).get_query_request(response)
     encoding = response.encoding
     url = request.url
     query = get_url_query(url)
     query['province'] = self.province.encode(encoding)
     query['city'] = self.city.encode(encoding)
     query['filt'] = b'y'
     query.pop('button_click', None)
     new_url = change_url_query(url, query)
     request = request.replace(url=new_url)
     return request
Code example #10
File: utils.py Project: hackrole/scrapy-utils
def caiban_sina_weibo_login(user, pw):
    encoding = 'utf8'
    br = get_br()
    try:
        br.open('http://weibo.cn/')
        br.follow_link(text='登录'.encode(encoding))  # '登录' = "Log in" link
    except Exception as e:
        return '', str(e)
    form = list(br.forms())[0]
    user_control = form.controls[0]
    pw_control = form.controls[1]
    # remember_control = form.controls[2]
    user_control.value = user.encode(encoding)
    pw_control.value = pw.encode(encoding)
    #default is on
    #remember_control.value = ['on',]
    try:
        br.open(form.click())
    except Exception as e:
        return '', str(e)
    url = br.geturl()
    gsid = get_url_query(url).get('gsid', '')
    # url = 'http://weibo.cn/?gsid=%(gsid)s&vt=4' % {'gsid': gsid}
    # br.open(url)
    content = br.response().read().decode(encoding, 'ignore')
    if content.find('请输入图片中的字符') != -1:  # "enter the characters in the image" -> captcha required
        reason = 'yzm'  # yzm = captcha (验证码)
    elif content.find('您的微博帐号出现异常被暂时冻结') != -1:  # account temporarily frozen
        reason = 'freeze'
    elif content.find('@我的') != -1:  # the "@me" tab only appears when logged in
        reason = 'success'
    elif content.find('登录名或密码错误') != -1:  # wrong username or password
        reason = 'auth fail'
    elif url.find('http://login.weibo.cn/login') != -1:
        reason = 'fail'
    elif url.find('http://weibo.cn/pub/') != -1:
        reason = 'redirect'
    else:
        reason = 'unknown'
    return gsid, reason
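
caiban_sina_weibo_login drives a mechanize browser through the weibo.cn login form and returns a (gsid, reason) pair, where gsid is the session id recovered from the post-login URL. A hypothetical caller (credentials and handling below are illustrative only) would keep the gsid only when the reason signals success:

# Illustrative usage only; the account values are placeholders.
gsid, reason = caiban_sina_weibo_login('someuser', 'secret')
if reason == 'success' and gsid:
    print('login ok, gsid=%s' % gsid)  # e.g. pass the gsid on to the middleware in code example #4
else:
    print('login failed: %s' % reason)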
Code example #11
    def parse_list_page(self, response):
        """
        商店名:抓取
        性质:计算
        来源:计算
        类别:抓取?
        店主:抓取
        地址:抓取
        联系方式:抓取
        创店日期:抓取
        主营产品:抓取
        状态:?
        操作:不需要
        shop_info = {}
        shop_info['shop_type_id'] = 10
        shop_info['shop_name'] = self.company
        shop_info['shop_address'] = self.address
        if not self.address:
            shop_info['shop_address'] = '山东淄博'
        shop_info['shop_contacts'] = self.contact
        shop_info['shop_phone'] = self.phone
        shop_info['shop_products'] = self.keywords
        shop_info['shop_site_url'] = self.site_url
        shop_info['shop_site_url_hash'] = fnvhash.fnv_64a_str(self.site_url)
        shop_info['shop_site_type'] = 24
        shop_info['shop_certified'] = 1
        shop_info['shop_owner_type'] = 1
        company_key = ['厂','站','公司', '事务所', '集团']
        for item in company_key:
            if item in self.company:
                shop_info['shop_owner_type'] = 2
        """


        multi_xpath = '//*[@id="sw_mod_searchlist"]/ul/li'
        html5_response = response_html5parse(response)
        page_hxs = HtmlXPathSelector(html5_response)
        multi_hxs = page_hxs.select(multi_xpath)
        for hxs in multi_hxs:
            # shop_products holds all shop-related text: main products, brief description and detail description
            # scraped fields
            shop_name = ''.join(hxs.select('.//a[@class="sw-ui-font-title14"]//text()').extract())
            shop_address = ''.join(hxs.select('.//div[@class="sm-offerResult-address"]/a/text()').extract())
            # main products, e.g. "有机化工原料; 苯 醇 酯 醚 类 批发" (organic chemical raw materials; wholesale of benzenes, alcohols, esters and ethers)
            # only part of the main products appears here; the full list is in the title of the main-products link
            shop_part_products = ''.join(hxs.select('.//div[@class="sm-offerResult-sale"]//text()').extract())
            shop_brief = ''.join(hxs.select('.//div[@class="sm-offerResult-sub"]//text()').extract())
            creditdetail_url = ''.join(hxs.select('.//a[@class="sw-ui-font-title14"]/@href').extract())
            creditdetail_url = urllib.unquote(creditdetail_url).strip()
            # computed fields
            shop_products = shop_brief + shop_part_products
            creditdetail_url_query = get_url_query(creditdetail_url)
            creditdetail_url_query.pop('tracelog', None)
            creditdetail_url = change_url_query(creditdetail_url, creditdetail_url_query)
            shop_site_url = get_site_url(creditdetail_url)
            shop_owner_type = calc_shop_owner_type(shop_name)
            shop_site_url_hash = fnvhash.fnv_64a_str(shop_site_url)
            # fields with no matching data on the page
            shop_qq = None
            shop_email = None
            lack_doc = {
                'shop_qq': shop_qq,
                'shop_email': shop_email,
            }
            # fields filled with default values
            shop_type_id = None
            shop_area_id = None
            shop_site_type = self.shop_site_type
            shop_certified = 1
            city_id = 1
            is_bad_url = 0
            is_bad_time = None
            deleted = 0
            isRead = 0
            isImport = 0
            default_doc = {
                'shop_type_id': shop_type_id,
                'shop_area_id': shop_area_id,
                'shop_site_type': shop_site_type,
                'shop_certified': shop_certified,
                'city_id': city_id,
                'is_bad_url': is_bad_url,
                'is_bad_time': is_bad_time,
                'deleted': deleted,
                'isRead': isRead,
                'isImport': isImport,
            }
            now = datetime.datetime.utcnow()

            doc = {
                'shop_name': shop_name,
                'shop_address': shop_address,
                'shop_products': shop_products,
                'shop_site_url': shop_site_url,
                'shop_site_url_hash': shop_site_url_hash,
                'shop_owner_type': shop_owner_type,
                'crawl_time': now,
            }
            doc.update(lack_doc)
            doc.update(default_doc)

            detail_url = creditdetail_url
            list_url = response.url
            query = response.meta.get('query')
            item = LegItem(collection=self.collection, doc=doc,
                              detail_url=detail_url, list_url=list_url, query=query)
            if detail_url and self.visit_detail:
                detail_request = Request(detail_url, callback=self.parse_detail_page)
                detail_request.meta['item'] = item
                detail_request.meta['query'] = query
                yield detail_request
            else:
                yield item
Code example #12
 def _get_user_id_from_url(self, url):
     return get_url_query(url).get('uid', '')