def parse_sina_com_cn(self, response):
    """Collect play and comment statistics for a Sina video page.

    The video id and news id are pulled out of the page source, two
    counting endpoints are queried, and the numbers are written to the
    ``doc`` mapping of the item carried in ``response.meta``.  Up and
    down votes are always stored as 0 by this method.

    Returns the item taken from ``response.meta['item']``.
    """
    selector = HtmlXPathSelector(response)
    video_id = selector.re('vid:.*?(\d+)\|\d+')[0]
    news_id = selector.re("newsid:'([-\w]+)")[0]

    # Play count: the endpoint is queried with "<vid>-<vid>".
    count_template = "http://count.kandian.com/getCount.php?vids=%s&action=flash"
    count_url = count_template % ("%s-%s" % (video_id, video_id))
    count_body = urllib.urlopen(count_url).read()
    play_count = re.findall('\d+":"(\d+)', count_body)[0]

    # Comment count: the first and last characters of the reply are
    # dropped before the remainder is decoded as JSON.
    comment_template = "http://comment5.news.sina.com.cn/cmnt/info_wb?channel=movie&newsid=%s&page=1&callback="
    comment_url = comment_template % (news_id,)
    comment_body = urllib.urlopen(comment_url).read()
    comment_info = json.loads(comment_body[1:-1])
    total_comments = comment_info["result"]['data']['total_number']

    result = response.meta['item']
    stats = result['doc']
    for key, value in (('pv', play_count), ('up', 0), ('down', 0),
                       ('comments', total_comments)):
        stats[key] = value
    return result
def parse_letv_com(self, response):
    """Collect play and comment statistics for a LeTV video page.

    The ``pid``, ``vid`` and ``mmsid`` values are read from the page
    source, then the play-count and comment endpoints are queried and the
    first number following the relevant field name is taken from each
    reply.  Up and down votes are always stored as 0 by this method.

    Fix: the comment URL previously contained ``type=video¬ice=1`` -- the
    ``&not`` of ``&notice`` had been decoded as an HTML entity -- so the
    ``notice`` parameter was never sent.  The query string now carries
    ``&notice=1``.

    Returns the item taken from ``response.meta['item']``; its ``doc``
    mapping receives ``pv``, ``up``, ``down`` and ``comments``.
    Raises ``IndexError`` when any of the expected patterns is absent.
    """
    hxs = HtmlXPathSelector(response)
    pid = hxs.re(r'pid:(\d+)')[0]
    vid = hxs.re(r'vid:(\d+)')[0]
    mid = hxs.re(r'mmsid:(\d+)')[0]

    # Play count.
    url_t = "http://stat.letv.com/vplay/queryMmsTotalPCount?callback=&cid=1&vid=%s&mid=%s&pid=%s"
    url = url_t % (vid, mid, pid)
    text = urllib.urlopen(url).read()
    pv = re.findall(r'media_play_count.*?(\d+)', text)[0]
    up = 0
    down = 0

    # Comment count ("&notice=1" restored, see docstring).
    url_tt = "http://api.my.letv.com/vcm/api/g?jsonp=&type=video&notice=1&pid=%s&xid=%s&mmsid=%s&rows=10&page=1"
    url2 = url_tt % (pid, vid, mid)
    text2 = urllib.urlopen(url2).read()
    comments = re.findall(r'total.*?(\d+)', text2)[0]

    item = response.meta['item']
    doc = item['doc']
    doc['pv'] = pv
    doc['up'] = up
    doc['down'] = down
    doc['comments'] = comments
    return item
def parse_sohu_com(self, response):
    """Fill in play, vote and comment statistics for a Sohu video page.

    The video, playlist and category ids are read from the page's inline
    JavaScript variables; three endpoints are then queried in turn and the
    results written to the ``doc`` mapping of ``response.meta['item']``.

    Returns that item.
    """
    selector = HtmlXPathSelector(response)

    def page_var(pattern):
        # All matches are concatenated, so a missing variable yields ''.
        return ''.join(selector.re(pattern)).strip()

    video_id = page_var('var vid="(\d+)')
    playlist_id = page_var('var playlistId="(\d+)')
    category_id = page_var('var cid="(\d+)')

    # Play count: every run of digits in the reply is joined together.
    stat_template = "http://count.vrs.sohu.com/count/stat.do?videoId=%s&playlistId=%s&categoryId=%s"
    stat_body = urllib.urlopen(
        stat_template % (video_id, playlist_id, category_id)).read()
    play_count = ''.join(re.findall('(\d+)', stat_body))

    # Votes: the span from the first '{' to the last '}' is decoded as JSON.
    digg_template = "http://score.my.tv.sohu.com/digg/get.do?vid=%s&type=%s"
    digg_body = urllib.urlopen(digg_template % (video_id, category_id)).read()
    json_start = digg_body.find('{')
    json_end = digg_body.rfind('}') + 1
    votes = json.loads(digg_body[json_start:json_end])
    up_votes = votes['upCount']
    down_votes = votes['downCount']

    # Comment count.
    reply_template = "http://access.tv.sohu.com/reply/list.do?objid=%s&subobjid=%s&objtype=%s"
    reply_body = urllib.urlopen(
        reply_template % (playlist_id, video_id, category_id)).read()
    total_comments = re.findall('"allCount":(\d+)', reply_body)[0]

    result = response.meta['item']
    stats = result['doc']
    stats['pv'] = play_count
    stats['up'] = up_votes
    stats['down'] = down_votes
    stats['comments'] = total_comments
    return result
def parse_item(self, response):
    """Record which Google tracking snippets an item page embeds.

    Fills a ``gsScrapItem`` with the page URL, its title and one 0/1 flag
    per product: ``GA``, ``Universal_Analytics``, ``GA_Remarketing``,
    ``Google_AdWords``, ``GA_Steroids`` and ``Google_Tag_Manager``.  A
    flag is 1 only when every marker probed for that product is found.

    Fixes relative to the previous version:

    * The Universal Analytics patterns required square brackets around
      the ``ga(...)`` arguments, so they could not match the calls they
      were written for (``ga('send', 'pageview')`` and
      ``ga('create', 'UA-XXXX-Y')``).  Brackets are now optional.
    * A page without a ``<title>`` gets an empty title instead of raising
      ``IndexError``.
    * The inline-script nodes are selected once and reused.

    Returns the populated item.
    """
    self.log('Hi, this is an item page! %s' % response.url)
    hxs = HtmlXPathSelector(response)
    # Every inline-script probe runs against the same nodes; select once.
    scripts = hxs.select('//script/text()')

    item = gsScrapItem()
    item['URL'] = response.url
    titles = hxs.select('//title/text()').extract()
    item['Title'] = titles.pop().strip() if titles else ''

    # Classic GA -- _gaq.push(['_trackPageview']);
    gaq1 = scripts.re(r"_gaq\.push\( *\[ *\'_trackPageview\' *\] *\)")
    # _gaq.push(['_setAccount', 'UA-XXXXX-Y']);
    gaq2 = scripts.re(
        r"_gaq\.push\( *\[ *\'_setAccount\' *\, *'UA\-.*\-.*\'*\] *\)")
    # ga.js
    gajs1 = scripts.re(r'ga\.js')
    item['GA'] = int(bool(gajs1 and gaq1 and gaq2))

    # Universal Analytics -- analytics.js
    ua1 = scripts.re(r'analytics\.js')
    # ga('send', 'pageview');  (further arguments may follow)
    ua2 = scripts.re(r"ga\( *\[? *'send' *, *'pageview'")
    # ga('create', 'UA-XXXX-Y');  (further arguments may follow)
    ua3 = scripts.re(r"ga\( *\[? *'create' *, *'UA-[^']*-[^']*'")
    item['Universal_Analytics'] = int(bool(ua1 and ua2 and ua3))

    # Remarketing -- dc.js combined with the classic _gaq calls.
    dcjs1 = scripts.re(r'dc\.js')
    item['GA_Remarketing'] = int(bool(dcjs1 and gaq1 and gaq2))

    # AdWords -- var google_conversion_id = XXXXXXXXX;
    gaw1 = scripts.re(r"var * google_conversion_id *\= * \d+")
    # The loader reference is searched for in the whole page.
    gaw2 = hxs.re(r'googleadservices\.com\/pagead\/conversion\.js')
    item['Google_AdWords'] = int(bool(gaw1 and gaw2))

    # GA on Steroids -- _gas.push(['_trackPageview']);
    gas1 = scripts.re(r"_gas\.push\( *\[ *\'_trackPageview\' *\] *\)")
    # _gas.push(['_setAccount', 'UA-XXXXX-Y']);
    gas2 = scripts.re(
        r"_gas\.push\( *\[ *\'_setAccount\' *\, *'UA\-.*\-.*\'*\] *\)")
    # gas-1.10.1.min.js
    gasjs1 = scripts.re(r'gas.*\.js')
    item['GA_Steroids'] = int(bool(gasjs1 and gas1 and gas2))

    # Tag Manager -- www.googletagmanager.com/ns.html and .../gtm.js,
    # both searched for in the whole page.
    gtm1 = hxs.re(r'googletagmanager\.com\/ns\.html')
    gtm2 = hxs.re(r'googletagmanager\.com\/gtm\.js')
    item['Google_Tag_Manager'] = int(bool(gtm1 and gtm2))
    return item
def parse_item(self, response):
    """Detect Google tracking products embedded in an item page.

    Builds a ``gsScrapItem`` holding the page URL, its title and a 0/1
    flag for each of ``GA``, ``Universal_Analytics``, ``GA_Remarketing``,
    ``Google_AdWords``, ``GA_Steroids`` and ``Google_Tag_Manager``.  Each
    flag needs all of its markers to be present.

    Fixes relative to the previous version:

    * The two Universal Analytics call patterns insisted on ``ga([...])``
      with square brackets, which the calls named in the code's own
      comments (``ga('send', 'pageview')``, ``ga('create', 'UA-XXXX-Y')``)
      do not have; the brackets are now optional.
    * A missing ``<title>`` no longer raises ``IndexError``; the title is
      stored as an empty string.
    * ``//script/text()`` is evaluated once instead of for every probe.

    Returns the populated item.
    """
    self.log('Hi, this is an item page! %s' % response.url)
    hxs = HtmlXPathSelector(response)
    inline_js = hxs.select('//script/text()')

    def flag(*marker_hits):
        # 1 when every probe produced at least one match, else 0.
        return int(all(marker_hits))

    item = gsScrapItem()
    item['URL'] = response.url
    title_nodes = hxs.select('//title/text()').extract()
    item['Title'] = title_nodes.pop().strip() if title_nodes else ''

    # _gaq.push(['_trackPageview']);
    gaq1 = inline_js.re(r"_gaq\.push\( *\[ *\'_trackPageview\' *\] *\)")
    # _gaq.push(['_setAccount', 'UA-XXXXX-Y']);
    gaq2 = inline_js.re(
        r"_gaq\.push\( *\[ *\'_setAccount\' *\, *'UA\-.*\-.*\'*\] *\)")
    # ga.js
    gajs1 = inline_js.re(r'ga\.js')
    item['GA'] = flag(gajs1, gaq1, gaq2)

    # analytics.js
    ua1 = inline_js.re(r'analytics\.js')
    # ga('send', 'pageview');  -- trailing arguments are tolerated
    ua2 = inline_js.re(r"ga\( *\[? *'send' *, *'pageview'")
    # ga('create', 'UA-XXXX-Y');  -- trailing arguments are tolerated
    ua3 = inline_js.re(r"ga\( *\[? *'create' *, *'UA-[^']*-[^']*'")
    item['Universal_Analytics'] = flag(ua1, ua2, ua3)

    # dc.js
    dcjs1 = inline_js.re(r'dc\.js')
    item['GA_Remarketing'] = flag(dcjs1, gaq1, gaq2)

    # var google_conversion_id = XXXXXXXXX;
    gaw1 = inline_js.re(r"var * google_conversion_id *\= * \d+")
    gaw2 = hxs.re(r'googleadservices\.com\/pagead\/conversion\.js')
    item['Google_AdWords'] = flag(gaw1, gaw2)

    # _gas.push(['_trackPageview']);
    gas1 = inline_js.re(r"_gas\.push\( *\[ *\'_trackPageview\' *\] *\)")
    # _gas.push(['_setAccount', 'UA-XXXXX-Y']);
    gas2 = inline_js.re(
        r"_gas\.push\( *\[ *\'_setAccount\' *\, *'UA\-.*\-.*\'*\] *\)")
    # gas-1.10.1.min.js
    gasjs1 = inline_js.re(r'gas.*\.js')
    item['GA_Steroids'] = flag(gasjs1, gas1, gas2)

    # www.googletagmanager.com/ns.html and www.googletagmanager.com/gtm.js
    gtm1 = hxs.re(r'googletagmanager\.com\/ns\.html')
    gtm2 = hxs.re(r'googletagmanager\.com\/gtm\.js')
    item['Google_Tag_Manager'] = flag(gtm1, gtm2)
    return item
def parse_board(self, response):
    """Yield requests for the listing pages of a board.

    The page total is read from the response body and capped at 30; one
    request per page index is yielded, each handled by
    ``parse_board_page``.
    """
    selector = HtmlXPathSelector(response)
    bid = response.meta['bid']  # read from meta but not used below
    total_pages = int(selector.re(r'<\/b>\/<b>(\d+)')[0])
    page_count = min(total_pages, 30)
    # NOTE(review): the range stops before page_count, so the page whose
    # index equals page_count is never requested -- confirm this is intended.
    for page_index in range(1, page_count):
        page_url = response.url + '&page=%i' % page_index
        yield Request(url=page_url, callback=self.parse_board_page)
def get_time(self,response,item):
    """Store the first ``YYYY-MM-DD HH:MM:SS`` timestamp found in the page.

    The value is written to ``item['news_time']`` and echoed to stdout.
    Raises ``IndexError`` when the page contains no such timestamp.
    """
    stamps = HtmlXPathSelector(response).re(
        r'\d{4}-\d{2}-\d{2} \d{2}:\d{2}:\d{2}')
    first_stamp = stamps[0]
    item['news_time'] = first_stamp
    print('time: ' + first_stamp)
def parse_qq_com(self, response):
    """Fill in play, vote and comment statistics for a QQ video page.

    Two ids are read from the page source and three endpoints are
    queried; the figures are written to the ``doc`` mapping of
    ``response.meta['item']``, which is then returned.
    """
    selector = HtmlXPathSelector(response)
    page_id = ''.join(selector.re('id :"(\w+)",'))
    video_id = ''.join(selector.re('vid:"(\w+)",'))

    def fetch(template, key):
        # Substitute the id into the endpoint URL and return the raw body.
        return urllib.urlopen(template % (key,)).read()

    # Play count: all "num" values in the reply are concatenated.
    play_body = fetch(
        "http://sns.video.qq.com/tvideo/fcgi-bin/batchgetplaymount?id=%s&otype=json",
        page_id)
    play_count = ''.join(re.findall('"num":(\d+)', play_body)).strip()

    # Votes: the reply must carry exactly two "num" values, taken as
    # (down, up) in that order.
    vote_body = fetch(
        "http://sns.video.qq.com/tvideo/fcgi-bin/spvote?&t=3&otype=json&keyid=%s",
        video_id)
    down_votes, up_votes = re.findall('"num":(\d+)', vote_body)

    # Comment figure: taken from the "totpg" field of the reply.
    comment_body = fetch(
        "http://sns.video.qq.com/fcgi-bin/liveportal/comment?otype=json&p=1&t=0&sz=10&id=%s",
        page_id)
    total_comments = ''.join(re.findall('"totpg":(\d+)', comment_body))

    result = response.meta['item']
    stats = result['doc']
    stats['pv'] = play_count
    stats['up'] = up_votes
    stats['down'] = down_votes
    stats['comments'] = total_comments
    return result
def parse_youku_com(self, response):
    """Fill in play, vote and comment statistics for a Youku video page.

    The video id is read from the page source, the statistics fragment
    for that id is downloaded and parsed as HTML, and the figures (with
    thousands separators removed) are written as integers to the ``doc``
    mapping of ``response.meta['item']``, which is then returned.
    """
    def as_int(text):
        # "1,234,567" -> 1234567
        return int(text.replace(',', ''))

    page = HtmlXPathSelector(response)
    video_id = page.re('var videoId.*?(\d+)')[0]

    stats_url = "http://v.youku.com/v_vpactionInfo/id/%s" % (video_id,)
    stats_html = urllib.urlopen(stats_url).read()
    fragment = HtmlXPathSelector(text=stats_html)

    play_text = fragment.select(
        '//ul[@class="row"]//span[@class="num"]/text()').extract()[0]
    play_count = as_int(play_text)

    # Remaining figures: the first span holds "up/down", the third the
    # comment count.
    other_spans = fragment.select('//ul[@class="half"]//span/text()').extract()
    up_text, down_text = other_spans[0].split('/')
    up_votes, down_votes = as_int(up_text), as_int(down_text)
    total_comments = as_int(other_spans[2])

    result = response.meta['item']
    stats = result['doc']
    stats['pv'] = play_count
    stats['up'] = up_votes
    stats['down'] = down_votes
    stats['comments'] = total_comments
    return result
def parse_geo(self, response):
    """Attach coordinates found in the response to the pending store.

    The loader in ``response.meta['store']`` receives ``latitude`` and
    ``longitude`` values, and the loaded item is returned.  The pattern
    must produce exactly two values, otherwise the unpacking raises
    ``ValueError``.
    """
    selector = HtmlXPathSelector(response)
    loader = response.meta['store']
    # The first captured number is the longitude, the second the latitude.
    # NOTE(review): the pattern has no sign, so negative coordinates would
    # lose their minus -- confirm the source never emits them.
    coordinates = selector.re(r'(\d+\.\d+),(\d+\.\d+)')
    longitude, latitude = coordinates
    loader.add_value('latitude', latitude)
    loader.add_value('longitude', longitude)
    return loader.load_item()
def get_hot(self, response, item):
    """Store the page's ``totalCount`` value as ``item['news_hot']``.

    The first ``totalCount = ...,`` fragment in the page is located and
    the text between ``"= "`` and the trailing comma is kept.  When that
    text is non-empty it is echoed to stdout and written to the item.
    Raises ``IndexError`` when the page has no such fragment.
    """
    fragments = HtmlXPathSelector(response).re(r'totalCount = .*,')
    first_fragment = fragments[0]
    assignment = re.compile(r'=.*,').search(first_fragment).group()
    # Drop the leading "= " and the trailing ",".
    news_hot = assignment[2:-1]
    if not news_hot:
        return
    print('hot or comments: ' + news_hot)
    item['news_hot'] = news_hot
def parse_note(self, response):
    """Scrape one Douban note and follow links to other notes.

    Yields a ``Request`` for every note URL in the page whose id differs
    from the current note's id, then yields a ``DoubanNotesItem`` holding
    the note id, owner, title, URL and text content.
    """
    selector = HtmlXPathSelector(response)
    note = DoubanNotesItem()

    # The id is the second-to-last path segment of the URL.
    note['nid'] = response.url.split('/')[-2]

    owner_markup = selector.select(
        '//*[@id="db-usr-profile"]/div[1]/a').extract()[0]
    note['owner'] = re.search('/people/(.+)/', owner_markup).groups()[0]

    title_xpath = '//*[@id="note-%s"]/div[1]/h1/text()'% note['nid']
    note['title'] = selector.select(title_xpath).extract()[0]
    note['url'] = response.url

    text_nodes = selector.select('//*[@id="link-report"]/text()').extract()
    note['content'] = ''.join(text_nodes)

    for candidate in selector.re('http://www.douban.com/note/\d+/'):
        linked_id = re.findall(
            'http://www.douban.com/note/(\d+)/', candidate)[0]
        if note['nid'] == linked_id:
            continue
        # Each candidate is already a full match of the note-URL pattern.
        yield Request(candidate)

    yield note
def parse_pps_tv(self, response):
    """Fill in play, vote and comment statistics for a PPS video page.

    The upload id is read from the page source and the statistics
    endpoint's JSON reply supplies all four figures, which are written
    to the ``doc`` mapping of ``response.meta['item']``.  That item is
    returned.
    """
    selector = HtmlXPathSelector(response)
    upload_id = selector.re("upload_id.*?(\d+)")[0]

    stats_url = "http://v.pps.tv/ugc/ajax/ugc.php?type=5&upload_id=%s" % (
        upload_id,)
    reply = json.loads(urllib.urlopen(stats_url).read())

    # The play count is read from the reply key spelled "paly_num".
    play_count = reply["paly_num"]
    up_votes = reply["up"]
    down_votes = reply["down"]
    total_comments = reply["cmt"]

    result = response.meta['item']
    stats = result['doc']
    stats['pv'] = play_count
    stats['up'] = up_votes
    stats['down'] = down_votes
    stats['comments'] = total_comments
    return result
def parse_tudou_com(self, response):
    """Fill in play, vote and comment statistics for a Tudou video page.

    The item id is read from the page source; the summary endpoint's
    reply is unwrapped from its callback parentheses, decoded as JSON and
    its figures written to the ``doc`` mapping of
    ``response.meta['item']``, which is then returned.
    """
    selector = HtmlXPathSelector(response)
    item_id = selector.re('iid:\s*(\d+)')[0]

    summary_template = "http://www.tudou.com/tva/itemSum.srv?jsoncallback=__TVA_itemSum&iabcdefg=%s&uabcdefg=0&showArea=true&app=5"
    raw_reply = urllib.urlopen(summary_template % (item_id,)).read()

    # Keep what lies between the first '(' and the first ')'.
    payload_start = raw_reply.find('(') + 1
    payload_end = raw_reply.find(')')
    summary = json.loads(raw_reply[payload_start:payload_end])

    play_count = summary['playNum']
    up_votes = summary['digNum']
    down_votes = summary['buryNum']
    total_comments = summary['commentNum']

    result = response.meta['item']
    stats = result['doc']
    stats['pv'] = play_count
    stats['up'] = up_votes
    stats['down'] = down_votes
    stats['comments'] = total_comments
    return result
def parse_item(self,response):
    """Build one ``DoubanItem`` per song listed in the page's embedded JSON.

    Every ``[{"name" ... ]`` fragment found in the page is decoded as a
    JSON array; each entry contributes its ``name`` and ``rawUrl``, while
    the singer fields are copied from the item in ``response.meta``.

    Fix: the collected items were only printed and the method returned
    ``None``, so the list never reached the caller.  It is now returned
    (the debug print is kept).

    Returns a list of ``DoubanItem`` objects, empty when the page holds
    no song data.
    """
    hxs = HtmlXPathSelector(response)
    item = response.meta['item']
    items = []
    song = []
    try:
        song = hxs.re(r'\[\{"name".*\]')
    except Exception as e:
        # Best effort, as before: report the failure and carry on with
        # no song data.
        print(e)
    for s in song:
        for b in json.loads(s):
            dou = DoubanItem()
            dou['singer'] = item['singer']
            dou['singerurl'] = item['singerurl']
            dou['name'] = b['name']
            dou['url'] = b['rawUrl']
            items.append(dou)
    print(items)
    return items
def parse_board_page(self, response):
    """Yield a thread request for every topic id on a board page.

    The board id is read from the page's ``currentBoardID`` script
    variable; each ``topic_<id>`` occurrence produces one request handled
    by ``parse_thread``.
    """
    selector = HtmlXPathSelector(response)
    board_id = selector.re(r'var currentBoardID = (\d+)')[0]
    topic_ids = selector.re(r'topic_(\d+)')
    for topic_id in topic_ids:
        thread_url = 'http://www.cc98.org/dispbbs.asp?boardid=%s&id=%s' % (
            board_id, topic_id)
        yield Request(url=thread_url, callback=self.parse_thread)
def parse_auction(self, response):
    """Scrape one auction lot page into an ``AuctionItem``.

    Reads the internal id from the URL query string, then the front-page
    details, buyer's premium, contact, payment, shipping and preview
    fields, delegating to the ``_grab_info``, ``_grab_tab_field``,
    ``_parse_address`` and ``_clean_field`` helpers.

    Returns the populated item.
    """
    hxs = HtmlXPathSelector(response)
    item = AuctionItem()

    # Internal id: 'auctionId' takes precedence over 'id'; neither being
    # present leaves the field unset.
    query = parse_qs(urlparse(response.url).query)
    for param in ('auctionId', 'id'):
        if param in query:
            item['internal_id'] = query[param][0]
            break

    # Front page.  The span list must hold exactly two entries.
    event_id, lot_id = hxs.select(
        '//div[@class="event-details"]//span/text()').extract()
    item['event_id'] = event_id
    item['lot_id'] = lot_id
    description_parts = hxs.select(
        '//div[@id="auction_lotDetails"]/text()').extract()
    item['description'] = ''.join(description_parts).strip()
    info_fields = (
        ('start_date', 'Open Time:'),
        ('end_date', 'Close Time:'),
        ('start_price', 'Opening Bid:'),
        ('current_price', 'Current Bid:'),
    )
    for field, label in info_fields:
        item[field] = self._grab_info(hxs, label)

    # Premium: defaults to '0' when the page does not mention one.
    premium = hxs.re('A (\d+)% Buyer\'s Premium applies to this lot.')
    item['buyers_premium'] = premium[0] if premium else '0'

    # Contact.
    self._parse_address(hxs, item)
    contact_fields = (
        ('country', 'Country of Origin:'),
        ('contact_phone', 'Contact Phone:'),
        ('contact_fax', 'Contact Fax:'),
        ('facility_manager', 'Facility Manager:'),
        ('facility_email', 'Facility EMail:'),
    )
    for field, label in contact_fields:
        item[field] = self._grab_tab_field(hxs, 'auction_contact', label)

    # Payment.
    payment_text = ''.join(
        hxs.select('//div[@id="auction_payment"]//text()').extract())
    item['payment_info'] = self._clean_field(payment_text)

    # Shipping.
    shipping_fields = (
        ('lot_weight', 'Approximate Lot Weight:'),
        ('weight_uom', 'Weight UOM:'),
        ('shipping_qty', 'Shipping QTY:'),
        ('approx_dim', 'Approximate Dim. or Lot Cube:'),
    )
    for field, label in shipping_fields:
        item[field] = self._grab_tab_field(hxs, 'auction_shippingInfo', label)

    # Preview.
    preview_fields = (
        ('preview_arrangements', 'Preview Arrangements:'),
        ('loadout_procedures', 'Loadout Procedures:'),
        ('security_procedures', 'Secuity Procedures:'),
    )
    for field, label in preview_fields:
        item[field] = self._grab_tab_field(hxs, 'auction_preview', label)

    return item