Python HtmlXPathSelector.re Examples, scrapy.selector.HtmlXPathSelector.re Python Examples

Example #1

0

Show file

File: video_detail.py Project: hackrole/scrapy-utils

    def parse_sina_com_cn(self, response):
        """Fill the play and comment counters for a sina.com.cn video page.

        The video id and news id are pulled out of the page source with
        regular expressions. The play count and the comment count are then
        fetched synchronously with ``urllib`` and written into
        ``response.meta['item']['doc']`` under ``pv``, ``up``, ``down`` and
        ``comments`` (``up`` and ``down`` are always 0 for this site).

        Returns the item taken from ``response.meta['item']``.

        Raises IndexError when one of the regular expressions finds nothing.
        """
        hxs = HtmlXPathSelector(response)

        # Raw strings keep the regex backslashes away from Python's own
        # string-escape processing.
        vid = hxs.re(r'vid:.*?(\d+)\|\d+')[0]
        nid = hxs.re(r"newsid:'([-\w]+)")[0]

        # The count endpoint is queried with the id written as "<vid>-<vid>".
        url_t = "http://count.kandian.com/getCount.php?vids=%s&action=flash"
        count_url = url_t % ("%s-%s" % (vid, vid))
        count_body = urllib.urlopen(count_url).read()
        pv = re.findall(r'\d+":"(\d+)', count_body)[0]

        url_tt = "http://comment5.news.sina.com.cn/cmnt/info_wb?channel=movie&newsid=%s&page=1&callback="
        comment_body = urllib.urlopen(url_tt % (nid,)).read()
        # Drop the first and last character before parsing -- presumably the
        # parentheses of the empty-callback JSONP wrapper; confirm against a
        # live response.
        dj = json.loads(comment_body[1:-1])
        comments = dj["result"]['data']['total_number']

        item = response.meta['item']
        doc = item['doc']
        doc['pv'] = pv
        doc['up'] = 0
        doc['down'] = 0
        doc['comments'] = comments
        return item

Example #2

0

Show file

File: video_detail.py Project: hackrole/scrapy-utils

    def parse_letv_com(self, response):
        """Fill the play and comment counters for a letv.com video page.

        ``pid``, ``vid`` and ``mmsid`` are read from the page source. The
        play count and the comment count are fetched synchronously with
        ``urllib`` and written into ``response.meta['item']['doc']`` under
        ``pv``, ``up``, ``down`` and ``comments`` (``up`` and ``down`` are
        always 0 for this site).

        Returns the item taken from ``response.meta['item']``.

        Raises IndexError when one of the regular expressions finds nothing.
        """
        hxs = HtmlXPathSelector(response)

        # Raw strings keep the regex backslashes away from Python's own
        # string-escape processing.
        pid = hxs.re(r'pid:(\d+)')[0]
        vid = hxs.re(r'vid:(\d+)')[0]
        mid = hxs.re(r'mmsid:(\d+)')[0]

        # Play count.
        url_t = "http://stat.letv.com/vplay/queryMmsTotalPCount?callback=&cid=1&vid=%s&mid=%s&pid=%s"
        play_body = urllib.urlopen(url_t % (vid, mid, pid)).read()
        pv = re.findall(r'media_play_count.*?(\d+)', play_body)[0]

        # Comment count.
        url_tt = "http://api.my.letv.com/vcm/api/g?jsonp=&type=video&notice=1&pid=%s&xid=%s&mmsid=%s&rows=10&page=1"
        comment_body = urllib.urlopen(url_tt % (pid, vid, mid)).read()
        comments = re.findall(r'total.*?(\d+)', comment_body)[0]

        item = response.meta['item']
        doc = item['doc']
        doc['pv'] = pv
        doc['up'] = 0
        doc['down'] = 0
        doc['comments'] = comments
        return item

Example #3

0

Show file

File: video_detail.py Project: hackrole/scrapy-utils

    def parse_sohu_com(self, response):
        """Fill the play, vote and comment counters for a sohu.com video page.

        ``vid``, ``playlistId`` and ``cid`` are read from the page's inline
        JavaScript; a missing value becomes an empty string. Three endpoints
        are then fetched synchronously with ``urllib`` and the results are
        written into ``response.meta['item']['doc']`` under ``pv``, ``up``,
        ``down`` and ``comments``.

        Returns the item taken from ``response.meta['item']``.

        Raises IndexError when the comment response has no ``"allCount"``
        field, and whatever ``json.loads`` raises when the vote response
        contains no parsable ``{...}`` object.
        """
        hxs = HtmlXPathSelector(response)

        # Raw strings keep the regex backslashes away from Python's own
        # string-escape processing.
        vid = ''.join(hxs.re(r'var vid="(\d+)')).strip()
        pid = ''.join(hxs.re(r'var playlistId="(\d+)')).strip()
        cid = ''.join(hxs.re(r'var cid="(\d+)')).strip()

        # Play count: every digit run in the response is concatenated.
        stat_url_t = "http://count.vrs.sohu.com/count/stat.do?videoId=%s&playlistId=%s&categoryId=%s"
        stat_body = urllib.urlopen(stat_url_t % (vid, pid, cid)).read()
        pv = ''.join(re.findall(r'(\d+)', stat_body))

        # Up/down votes: parse the outermost {...} found in the response.
        digg_url_t = "http://score.my.tv.sohu.com/digg/get.do?vid=%s&type=%s"
        digg_body = urllib.urlopen(digg_url_t % (vid, cid)).read()
        digg_json = digg_body[digg_body.find('{'): digg_body.rfind('}') + 1]
        dj = json.loads(digg_json)
        up = dj['upCount']
        down = dj['downCount']

        # Comment count.
        reply_url_t = "http://access.tv.sohu.com/reply/list.do?objid=%s&subobjid=%s&objtype=%s"
        reply_body = urllib.urlopen(reply_url_t % (pid, vid, cid)).read()
        comments = re.findall(r'"allCount":(\d+)', reply_body)[0]

        item = response.meta['item']
        doc = item['doc']
        doc['pv'] = pv
        doc['up'] = up
        doc['down'] = down
        doc['comments'] = comments
        return item

Example #4

0

Show file

File: gsScrap_spider.py Project: singlas/webCrawler

    def parse_item(self, response):
        """Record which Google tracking products an item page embeds.

        Each product flag on the returned ``gsScrapItem`` is 1 when all of
        its marker patterns are found and 0 otherwise. Script-level markers
        are searched in the text of the page's ``<script>`` elements; the
        AdWords loader and the Tag Manager markers are searched in the whole
        document.

        Returns the populated item.

        Raises IndexError when the page has no ``<title>`` text.
        """
        self.log('Hi, this is an item page! %s' % response.url)
        hxs = HtmlXPathSelector(response)
        # Every inline-script check runs against the same node set.
        scripts = hxs.select('//script/text()')

        item = gsScrapItem()
        item['URL'] = response.url
        item['Title'] = hxs.select('//title/text()').extract().pop().strip()

        # _gaq.push(['_trackPageview']);
        gaq1 = scripts.re(r"_gaq\.push\( *\[ *\'_trackPageview\' *\] *\)")
        # _gaq.push(['_setAccount', 'UA-XXXXX-Y']);
        gaq2 = scripts.re(
            r"_gaq\.push\( *\[ *\'_setAccount\' *\, *'UA\-.*\-.*\'*\] *\)")
        # ga.js
        gajs1 = scripts.re(r'ga\.js')
        item['GA'] = int(bool(gajs1 and gaq1 and gaq2))

        # analytics.js
        ua1 = scripts.re(r'analytics\.js')
        # ga('send', 'pageview');
        # The arguments are passed directly, not wrapped in a list, so the
        # pattern must not require "[" ... "]" the way _gaq.push does.
        ua2 = scripts.re(r"ga\( *'send' *, *'pageview'")
        # ga('create', 'UA-XXXX-Y');
        ua3 = scripts.re(r"ga\( *'create' *, *'UA-[^']*-[^']*'")
        item['Universal_Analytics'] = int(bool(ua1 and ua2 and ua3))

        # dc.js
        dcjs1 = scripts.re(r'dc\.js')
        item['GA_Remarketing'] = int(bool(dcjs1 and gaq1 and gaq2))

        # var google_conversion_id = XXXXXXXXX;
        gaw1 = scripts.re(r"var * google_conversion_id *\= * \d+")
        gaw2 = hxs.re(r'googleadservices\.com\/pagead\/conversion\.js')
        item['Google_AdWords'] = int(bool(gaw1 and gaw2))

        # _gas.push(['_trackPageview']);
        gas1 = scripts.re(r"_gas\.push\( *\[ *\'_trackPageview\' *\] *\)")
        # _gas.push(['_setAccount', 'UA-XXXXX-Y']);
        gas2 = scripts.re(
            r"_gas\.push\( *\[ *\'_setAccount\' *\, *'UA\-.*\-.*\'*\] *\)")
        # gas-1.10.1.min.js
        gasjs1 = scripts.re(r'gas.*\.js')
        item['GA_Steroids'] = int(bool(gasjs1 and gas1 and gas2))

        # www.googletagmanager.com/ns.html and
        gtm1 = hxs.re(r'googletagmanager\.com\/ns\.html')
        # www.googletagmanager.com/gtm.js
        gtm2 = hxs.re(r'googletagmanager\.com\/gtm\.js')
        item['Google_Tag_Manager'] = int(bool(gtm1 and gtm2))
        return item

Example #5

0

Show file

File: gsScrap_spider.py Project: singlas/webCrawler

    def parse_item(self, response):
        """Flag the Google tracking products found on an item page.

        For every product a 0/1 field is stored on a new ``gsScrapItem``:
        1 only when all marker patterns of that product match. Most markers
        are looked up in the text of ``<script>`` elements; the AdWords
        conversion loader and both Tag Manager markers are looked up in the
        full document.

        Returns the populated item.

        Raises IndexError when the page has no ``<title>`` text.
        """
        self.log('Hi, this is an item page! %s' % response.url)
        hxs = HtmlXPathSelector(response)
        # Select the inline scripts once and reuse them for every check.
        script_text = hxs.select('//script/text()')

        item = gsScrapItem()
        item['URL'] = response.url
        item['Title'] = hxs.select('//title/text()').extract().pop().strip()

        # _gaq.push(['_trackPageview']);
        gaq1 = script_text.re(r"_gaq\.push\( *\[ *\'_trackPageview\' *\] *\)")
        # _gaq.push(['_setAccount', 'UA-XXXXX-Y']);
        gaq2 = script_text.re(r"_gaq\.push\( *\[ *\'_setAccount\' *\, *'UA\-.*\-.*\'*\] *\)")
        # ga.js
        gajs1 = script_text.re(r'ga\.js')
        item['GA'] = int(bool(gajs1 and gaq1 and gaq2))

        # analytics.js
        ua1 = script_text.re(r'analytics\.js')
        # ga('send', 'pageview');
        # ga() takes its arguments directly (no list literal), so unlike the
        # _gaq.push patterns these must not demand "[" and "]".
        ua2 = script_text.re(r"ga\( *'send' *, *'pageview'")
        # ga('create', 'UA-XXXX-Y');
        ua3 = script_text.re(r"ga\( *'create' *, *'UA-[^']*-[^']*'")
        item['Universal_Analytics'] = int(bool(ua1 and ua2 and ua3))

        # dc.js
        dcjs1 = script_text.re(r'dc\.js')
        item['GA_Remarketing'] = int(bool(dcjs1 and gaq1 and gaq2))

        # var google_conversion_id = XXXXXXXXX;
        gaw1 = script_text.re(r"var * google_conversion_id *\= * \d+")
        gaw2 = hxs.re(r'googleadservices\.com\/pagead\/conversion\.js')
        item['Google_AdWords'] = int(bool(gaw1 and gaw2))

        # _gas.push(['_trackPageview']);
        gas1 = script_text.re(r"_gas\.push\( *\[ *\'_trackPageview\' *\] *\)")
        # _gas.push(['_setAccount', 'UA-XXXXX-Y']);
        gas2 = script_text.re(r"_gas\.push\( *\[ *\'_setAccount\' *\, *'UA\-.*\-.*\'*\] *\)")
        # gas-1.10.1.min.js
        gasjs1 = script_text.re(r'gas.*\.js')
        item['GA_Steroids'] = int(bool(gasjs1 and gas1 and gas2))

        # www.googletagmanager.com/ns.html and
        gtm1 = hxs.re(r'googletagmanager\.com\/ns\.html')
        # www.googletagmanager.com/gtm.js
        gtm2 = hxs.re(r'googletagmanager\.com\/gtm\.js')
        item['Google_Tag_Manager'] = int(bool(gtm1 and gtm2))
        return item

Example #6

0

Show file

File: cc98.py Project: goby/cc98_crawl

 def parse_board(self, response):
     """Yield one request per listing page of a board.

     The page total is read from the ``</b>/<b>N`` marker in the response
     and capped at 30. Each generated request appends ``&page=<index>`` to
     the response URL and is handled by ``self.parse_board_page``.

     Raises KeyError when ``response.meta`` has no ``'bid'`` entry and
     IndexError when the page-total marker is not found.
     """
     selector = HtmlXPathSelector(response)
     # Looked up (and therefore required to exist) but not used below.
     bid = response.meta['bid']
     total_pages = int(selector.re(r'<\/b>\/<b>(\d+)')[0])
     page_count = min(total_pages, 30)
     # NOTE(review): range(1, page_count) stops at page_count - 1, so the
     # last page is never requested -- confirm whether that is intended.
     for page_index in range(1, page_count):
         page_url = response.url + '&page=%i' % page_index
         yield Request(url=page_url, callback=self.parse_board_page)

Example #7

0

Show file

File: 163news.py Project: halibut735/python

 def get_time(self, response, item):
     """Store the first ``YYYY-MM-DD HH:MM:SS`` timestamp found in the page.

     The timestamp is written to ``item['news_time']`` and echoed to
     standard output.

     Raises IndexError when the page contains no such timestamp.
     """
     selector = HtmlXPathSelector(response)
     stamps = selector.re(r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
     first_stamp = stamps[0]
     item['news_time'] = first_stamp
     print('time: ' + first_stamp)

Example #8

0

Show file

File: video_detail.py Project: hackrole/scrapy-utils

    def parse_qq_com(self, response):
        """Fill the play, vote and comment counters for a qq.com video page.

        The ``id`` and ``vid`` values are read from the page source; a
        missing value becomes an empty string. Three endpoints are fetched
        synchronously with ``urllib`` and the results are written into
        ``response.meta['item']['doc']`` under ``pv``, ``up``, ``down`` and
        ``comments``.

        Returns the item taken from ``response.meta['item']``.

        Raises ValueError when the vote response does not contain exactly
        two ``"num"`` values.
        """
        hxs = HtmlXPathSelector(response)

        # Raw strings keep the regex backslashes away from Python's own
        # string-escape processing.
        pid = ''.join(hxs.re(r'id :"(\w+)",'))
        vid = ''.join(hxs.re(r'vid:"(\w+)",'))

        # Play count.
        url_t_1 = "http://sns.video.qq.com/tvideo/fcgi-bin/batchgetplaymount?id=%s&otype=json"
        play_body = urllib.urlopen(url_t_1 % (pid,)).read()
        pv = ''.join(re.findall(r'"num":(\d+)', play_body)).strip()

        # Votes: the first "num" is stored as down, the second as up.
        url_t_2 = "http://sns.video.qq.com/tvideo/fcgi-bin/spvote?&t=3&otype=json&keyid=%s"
        vote_body = urllib.urlopen(url_t_2 % (vid,)).read()
        down, up = re.findall(r'"num":(\d+)', vote_body)

        # Comment count, taken from the "totpg" field.
        url_t_3 = "http://sns.video.qq.com/fcgi-bin/liveportal/comment?otype=json&p=1&t=0&sz=10&id=%s"
        comment_body = urllib.urlopen(url_t_3 % (pid,)).read()
        comments = ''.join(re.findall(r'"totpg":(\d+)', comment_body))

        item = response.meta['item']
        doc = item['doc']
        doc['pv'] = pv
        doc['up'] = up
        doc['down'] = down
        doc['comments'] = comments
        return item

Example #9

0

Show file

File: video_detail.py Project: hackrole/scrapy-utils

    def parse_youku_com(self, response):
        """Fill the play, vote and comment counters for a youku.com video page.

        The video id is read from the page's ``var videoId`` assignment. The
        action-info page for that id is fetched synchronously with
        ``urllib`` and parsed as HTML. All counters are converted to ``int``
        after removing thousands separators, then written into
        ``response.meta['item']['doc']`` under ``pv``, ``up``, ``down`` and
        ``comments``.

        Returns the item taken from ``response.meta['item']``.

        Raises IndexError when an expected element or the video id is
        missing, and ValueError when a counter is not numeric or the vote
        text is not of the form ``up/down``.
        """
        hxs = HtmlXPathSelector(response)
        # Raw string keeps the regex backslash away from Python's own
        # string-escape processing.
        video_id = hxs.re(r'var videoId.*?(\d+)')[0]

        url_t = "http://v.youku.com/v_vpactionInfo/id/%s"
        text = urllib.urlopen(url_t % (video_id,)).read()
        hxs2 = HtmlXPathSelector(text=text)

        pv_text = hxs2.select('//ul[@class="row"]//span[@class="num"]/text()').extract()[0]
        pv = int(''.join(pv_text.split(',')))

        # First span holds "up/down"; the third span holds the comment count.
        d_tmp = hxs2.select('//ul[@class="half"]//span/text()').extract()
        up_text, down_text = d_tmp[0].split('/')
        up = int(''.join(up_text.split(',')))
        down = int(''.join(down_text.split(',')))
        comments = int(''.join(d_tmp[2].split(',')))

        item = response.meta['item']
        doc = item['doc']
        doc['pv'] = pv
        doc['up'] = up
        doc['down'] = down
        doc['comments'] = comments
        return item

Example #10

0

Show file

File: LawsonSpider.py Project: modeyang/ScrapyOfApp

    def parse_geo(self, response):
        """Add the coordinate pair found in the response to the store loader.

        The response is searched for ``<decimal>,<decimal>``; the first
        number is stored as ``longitude`` and the second as ``latitude`` on
        the loader held in ``response.meta['store']``.

        Returns the result of the loader's ``load_item()``.

        Raises ValueError unless the search yields exactly two values.
        """
        selector = HtmlXPathSelector(response)
        loader = response.meta['store']

        coordinates = selector.re(r'(\d+\.\d+),(\d+\.\d+)')
        longitude, latitude = coordinates
        loader.add_value('latitude', latitude)
        loader.add_value('longitude', longitude)
        return loader.load_item()

Example #11

0

Show file

File: 163news.py Project: halibut735/python

 def get_hot(self, response, item):
     """Store the value assigned to ``totalCount`` in the page, if any.

     The first ``totalCount = ...,`` occurrence is located, and the text
     between ``= `` and the last comma of that occurrence is kept. A
     non-empty result is echoed to standard output and written to
     ``item['news_hot']``.

     Raises IndexError when the page has no ``totalCount`` assignment.
     """
     selector = HtmlXPathSelector(response)
     first_hit = selector.re(r'totalCount = .*,')[0]
     assignment = re.compile(r'=.*,').search(first_hit).group()
     # Strip the leading "= " and the trailing ",".
     news_hot = assignment[2:-1]
     if not news_hot:
         return
     print('hot or comments: ' + news_hot)
     item['news_hot'] = news_hot

Example #12

0

Show file

File: notes.py Project: largetalk/douban-notes

    def parse_note(self, response):
        """Scrape one douban note and follow links to other notes.

        Yields a ``Request`` for every note URL found in the page whose id
        differs from the current note, then yields the populated
        ``DoubanNotesItem`` (``nid``, ``owner``, ``title``, ``url``,
        ``content``).

        Raises IndexError when the profile link or the title element is
        missing, and AttributeError when the profile link has no
        ``/people/<name>/`` segment.
        """
        hxs = HtmlXPathSelector(response)
        i = DoubanNotesItem()
        nid = response.url.split('/')[-2]
        i['nid'] = nid

        owner_html = hxs.select('//*[@id="db-usr-profile"]/div[1]/a').extract()[0]
        # owner_html is the whole <a>...</a> element, so a greedy ".+" would
        # run up to the "/" of the closing tag; stop at the first slash.
        i['owner'] = re.search(r'/people/([^/]+)/', owner_html).group(1)
        i['title'] = hxs.select('//*[@id="note-%s"]/div[1]/h1/text()' % nid).extract()[0]
        i['url'] = response.url
        i['content'] = ''.join(
            hxs.select('//*[@id="link-report"]/text()').extract())

        # Each candidate is already a full match of the note-URL pattern, so
        # it can be requested as is; only the id needs to be extracted.
        for candidate in hxs.re(r'http://www.douban.com/note/\d+/'):
            linked_nid = re.findall(r'http://www.douban.com/note/(\d+)/', candidate)[0]
            if nid != linked_nid:
                yield Request(candidate)

        yield i

Example #13

0

Show file

File: video_detail.py Project: hackrole/scrapy-utils

    def parse_pps_tv(self, response):
        """Fill the play, vote and comment counters for a pps.tv video page.

        The upload id is read from the page source. The ugc endpoint for
        that id is fetched synchronously with ``urllib``, parsed as JSON and
        its fields copied into ``response.meta['item']['doc']`` under
        ``pv``, ``up``, ``down`` and ``comments``.

        Returns the item taken from ``response.meta['item']``.

        Raises IndexError when no upload id is found and KeyError when the
        JSON response lacks one of the expected fields.
        """
        hxs = HtmlXPathSelector(response)

        # Raw string keeps the regex backslash away from Python's own
        # string-escape processing.
        video_id = hxs.re(r"upload_id.*?(\d+)")[0]
        url_t = "http://v.pps.tv/ugc/ajax/ugc.php?type=5&upload_id=%s"
        data = json.loads(urllib.urlopen(url_t % (video_id,)).read())

        item = response.meta['item']
        doc = item['doc']
        # NOTE(review): "paly_num" is presumably the key as spelled by the
        # endpoint -- confirm against a live response before "fixing" it.
        doc['pv'] = data["paly_num"]
        doc['up'] = data["up"]
        doc['down'] = data["down"]
        doc['comments'] = data["cmt"]
        return item

Example #14

0

Show file

File: video_detail.py Project: hackrole/scrapy-utils

    def parse_tudou_com(self, response):
        """Fill the play, vote and comment counters for a tudou.com video page.

        The item id is read from the page's ``iid:`` field. The itemSum
        endpoint is fetched synchronously with ``urllib``, its JSONP wrapper
        is removed, and the JSON fields are copied into
        ``response.meta['item']['doc']`` under ``pv``, ``up``, ``down`` and
        ``comments``.

        Returns the item taken from ``response.meta['item']``.

        Raises IndexError when no item id is found and KeyError when the
        JSON response lacks one of the expected fields.
        """
        hxs = HtmlXPathSelector(response)
        # Raw string keeps the regex backslashes away from Python's own
        # string-escape processing.
        video_id = hxs.re(r'iid:\s*(\d+)')[0]
        url_t = "http://www.tudou.com/tva/itemSum.srv?jsoncallback=__TVA_itemSum&iabcdefg=%s&uabcdefg=0&showArea=true&app=5"
        data_h = urllib.urlopen(url_t % (video_id,)).read()

        # Unwrap "callback( ... )": take everything between the first "("
        # and the LAST ")" so a ")" inside the payload cannot truncate it.
        payload = data_h[data_h.find('(') + 1:data_h.rfind(')')]
        data_j = json.loads(payload)

        item = response.meta['item']
        doc = item['doc']
        doc['pv'] = data_j['playNum']
        doc['up'] = data_j['digNum']
        doc['down'] = data_j['buryNum']
        doc['comments'] = data_j['commentNum']
        return item

Example #15

0

Show file

File: minyao.py Project: tongwenfeng/scrapy

 def parse_item(self, response):
     """Build a ``DoubanItem`` for every song listed in the page and print them.

     The page is searched for JSON arrays that start with ``[{"name"``.
     Each array is parsed, and every entry becomes a ``DoubanItem`` that
     carries the entry's ``name`` and ``rawUrl`` together with the singer
     fields of ``response.meta['item']``. The collected list is printed;
     nothing is returned.
     """
     hxs = HtmlXPathSelector(response)
     item = response.meta['item']
     items = []
     song = []
     try:
         # Raw string keeps the regex backslashes away from Python's own
         # string-escape processing.
         song = hxs.re(r'\[\{"name".*\]')
     except Exception as e:
         # Best effort: report the failure and continue with no songs.
         print(e)
     for s in song:
         for b in json.loads(s):
             dou = DoubanItem()
             dou['singer'] = item['singer']
             dou['singerurl'] = item['singerurl']
             dou['name'] = b['name']
             dou['url'] = b['rawUrl']
             items.append(dou)
     print(items)

Example #16

0

Show file

File: cc98.py Project: goby/cc98_crawl

 def parse_board_page(self, response):
     """Yield one thread request for every topic id found on a board page.

     The board id comes from the page's ``var currentBoardID`` assignment;
     each ``topic_<id>`` occurrence produces a request that is handled by
     ``self.parse_thread``.

     Raises IndexError when the board id is not present in the page.
     """
     selector = HtmlXPathSelector(response)
     board_id = selector.re(r'var currentBoardID = (\d+)')[0]
     topic_ids = selector.re(r'topic_(\d+)')
     for topic_id in topic_ids:
         thread_url = 'http://www.cc98.org/dispbbs.asp?boardid=%s&id=%s' % (
             board_id, topic_id)
         yield Request(url=thread_url, callback=self.parse_thread)

Example #17

0

Show file

    def parse_auction(self, response):
        """Populate an ``AuctionItem`` from a single auction page.

        The internal id is taken from the ``auctionId`` (preferred) or
        ``id`` query parameter of the response URL and is left unset when
        neither is present. The remaining fields are read from the page via
        XPath, a regular expression for the buyer's premium, and the
        ``_grab_info`` / ``_grab_tab_field`` / ``_parse_address`` /
        ``_clean_field`` helpers of this class.

        Returns the populated item.

        Raises ValueError when the event-details block does not yield
        exactly two text values (event id and lot id).
        """
        hxs = HtmlXPathSelector(response)

        item = AuctionItem()

        # Get internal id
        qs = parse_qs(urlparse(response.url).query)
        if 'auctionId' in qs:
            item['internal_id'] = qs['auctionId'][0]
        elif 'id' in qs:
            item['internal_id'] = qs['id'][0]

        # Front page
        item['event_id'], item['lot_id'] = hxs.select(
            '//div[@class="event-details"]//span/text()').extract()
        item['description'] = ''.join(
            hxs.select(
                '//div[@id="auction_lotDetails"]/text()').extract()).strip()

        item['start_date'] = self._grab_info(hxs, 'Open Time:')
        item['end_date'] = self._grab_info(hxs, 'Close Time:')

        item['start_price'] = self._grab_info(hxs, 'Opening Bid:')
        item['current_price'] = self._grab_info(hxs, 'Current Bid:')

        # Premium: defaults to '0' when the page states none.
        # Raw string keeps the regex backslash away from Python's own
        # string-escape processing.
        val = hxs.re(r"A (\d+)% Buyer's Premium applies to this lot.")
        item['buyers_premium'] = val[0] if val else '0'

        # Contact
        self._parse_address(hxs, item)
        item['country'] = self._grab_tab_field(hxs, 'auction_contact',
                                               'Country of Origin:')

        item['contact_phone'] = self._grab_tab_field(hxs, 'auction_contact',
                                                     'Contact Phone:')
        item['contact_fax'] = self._grab_tab_field(hxs, 'auction_contact',
                                                   'Contact Fax:')

        item['facility_manager'] = self._grab_tab_field(
            hxs, 'auction_contact', 'Facility Manager:')
        item['facility_email'] = self._grab_tab_field(hxs, 'auction_contact',
                                                      'Facility EMail:')

        # Payment
        item['payment_info'] = self._clean_field(''.join(
            hxs.select('//div[@id="auction_payment"]//text()').extract()))

        # Shipping
        item['lot_weight'] = self._grab_tab_field(hxs, 'auction_shippingInfo',
                                                  'Approximate Lot Weight:')
        item['weight_uom'] = self._grab_tab_field(hxs, 'auction_shippingInfo',
                                                  'Weight UOM:')
        item['shipping_qty'] = self._grab_tab_field(hxs,
                                                    'auction_shippingInfo',
                                                    'Shipping QTY:')
        item['approx_dim'] = self._grab_tab_field(
            hxs, 'auction_shippingInfo', 'Approximate Dim. or Lot Cube:')

        # Preview
        item['preview_arrangements'] = self._grab_tab_field(
            hxs, 'auction_preview', 'Preview Arrangements:')
        item['loadout_procedures'] = self._grab_tab_field(
            hxs, 'auction_preview', 'Loadout Procedures:')
        # NOTE(review): the label is spelled 'Secuity' -- presumably it
        # mirrors the text on the page; confirm before correcting it.
        item['security_procedures'] = self._grab_tab_field(
            hxs, 'auction_preview', 'Secuity Procedures:')

        return item