def parse(self, response):
    campionato = Selector(response).xpath('//h1[@class="page-title"]/text()')
    giornata = Selector(response).xpath('//h3/text()')
    prons = Selector(response).xpath('//tr[@class="field_collection_item odd" or @class="field_collection_item even"]')
    next_link = Selector(response).xpath('//li[@class="pager-next last active"]/*[@title="Vai alla pagina successiva"]/@href').extract()
    if len(next_link):
        yield self.make_requests_from_url('http://www.nostradamusbet.it' + next_link[0])
    for pronostico in prons:
        try:
            item = OnlineItem()
            item['campionato'] = campionato.extract()[0]
            item['giornata'] = giornata.extract()[1]
            item['dataora'] = pronostico.xpath('td[@class="field_data_ora"]/*[@content]/@content').extract()[0]
            item['casa'] = pronostico.xpath('td[@class="field_home"]/text()').extract()[0]
            item['fuori'] = pronostico.xpath('td[@class="field_away"]/text()').extract()[0]
            item['pronostico'] = pronostico.xpath('td[@class="field_pronostico"]/text()').extract()[0]
            item['quota'] = pronostico.xpath('td[@class="field_quota"]/text()').extract()[0]
            # item['esito'] = pronostico.xpath('td[@class="field_esito"]/text()').extract()[0]
            # item['cross'] = pronostico.xpath('td[@class="field_risultato"]/span/text()').extract()[0]
            item['fonte'] = 'Nostr_Norm'
            yield item
        except IndexError:
            # skip rows that are missing one of the expected cells
            pass
def parse(self, response):
    self.driver.get(response.url)
    el = Selector(text=self.driver.page_source).xpath('//*[@id="content"]/table/tbody/tr/td[1]/a/@href')
    requestList = []
    for r in el.extract():
        requestList.append(Request(response.urljoin(r), callback=self.parsePost))
    el = Selector(text=self.driver.page_source).xpath('//ul[contains(@class, "pagination")]/li/a/@href')
    for r in el.extract():
        requestList.append(Request(response.urljoin(r)))
    if len(requestList) > 0:
        return requestList
    # nothing found: dismiss the blocking element, reload and retry once
    self.driver.find_element_by_xpath('//*[@id="content"]/div[1]/div[1]/a').click()
    self.driver.get(response.url)
    el = Selector(text=self.driver.page_source).xpath('//*[@id="content"]/table/tbody/tr/td[1]/a/@href')
    requestList = []
    for r in el.extract():
        requestList.append(Request(response.urljoin(r), callback=self.parsePost))
    el = Selector(text=self.driver.page_source).xpath('//ul[contains(@class, "pagination")]/li/a/@href')
    for r in el.extract():
        requestList.append(Request(response.urljoin(r)))
    if len(requestList) > 0:
        return requestList
    self.driver.close()
def createItem(self, response):
    item = WikiItem()
    # init Fields for correct sort
    item['uid'] = ""
    # URL from crawled Site (used for generatedUID -> elastic)
    m = re.search(r'(http[s]?:\/\/)?([^\/\s]+)(.*)', response.url)
    if m:
        relativeUrl = m.group(3)
        item['url'] = "https://de.wikipedia.org" + relativeUrl
    else:
        # fall back to the raw response URL (the original referenced an undefined `url` here)
        item['url'] = "https://de.wikipedia.org" + response.url
    responseSelector = Selector(response)
    # Plugin for easy HTML parsing
    soup = BeautifulSoup(responseSelector.extract(), 'html.parser')
    item['pageTitle'] = soup.find('title').text
    item['text'] = ""
    for p_tag in soup.findAll('p'):
        # strip tabs, newlines and non-breaking spaces from the paragraph text
        item['text'] = item['text'] + p_tag.text.replace(
            "\t", " ").replace("\r", " ").replace("\n", " ").replace(
            u"\xa0", " ").strip()
    # HTML Content of parsed Component
    item['html'] = responseSelector.extract()
    # Generated UID which is used as UID for Elastic, so every Item is Unique
    item['uid'] = self.generateUID(item, 'utf-8')
    return item
def parse_content(self, body_content, update_time):
    # parse the table title
    parse_body_content = Selector(text=body_content).xpath('//table/tbody/tr/td/font/h2/text()')
    if not parse_body_content:
        parse_detail = ''
        for parse_body_content in Selector(text=body_content).xpath('//table/tbody/tr'):
            counter = 0
            for parse_table_td_font in Selector(text=parse_body_content.extract()).xpath('//td/font/text()'):
                if counter == 0:
                    parse_detail += ';' + parse_table_td_font.extract() + ';'
                else:
                    parse_detail += parse_table_td_font.extract() + ','
                counter = counter + 1
        self.parse_to_json(parse_detail, update_time)
    else:
        for parse_body_content in Selector(text=body_content).xpath('//table/tbody/tr/td/font/h2/text()'):
            if parse_body_content.extract() == unicode('無停班停課訊息。', 'utf-8'):
                print "目前無停班停課訊息。"  # "No work/class suspension announcements at the moment."
            else:
                print "Something Happened..."
def parse_item(self, response):
    sel = Selector(response)
    if len(sel.extract()) < 10000:  # this is an empirical value to prevent error page
        # log.msg(str(len(sel.extract())) + " Retrying item with " + response.request.body, level=log.INFO)
        new_request = response.request.copy()
        new_request.dont_filter = True
        yield new_request
    else:
        with open("C:/Users/Dell/Desktop/test/item_%s.html" % str(self.item_count), "w") as f:
            f.write(sel.extract().encode("utf-8"))
        self.item_count = self.item_count + 1
        log.msg("item length is " + str(len(sel.extract())))
        item = ScrapostItem()
        item["number"] = sel.xpath('//td[@class="d1045m32"]/span[@class="label d1045m33"]')[0].xpath("text()").extract()
        con1 = sel.xpath('//td[@class="d1045m10"]/span')
        item["status"] = con1[0].xpath("text()").extract()
        item["price"] = con1[1].xpath("text()").extract()
        item["location"] = sel.xpath('//td[@class="d1045m36"]/span')[0].xpath("text()").extract()
        yield item
def parse(self, response):
    # grab a selector for the whole page
    hxs = Selector(response)
    print hxs.extract()
    for con in hxs.xpath(self.xpathConf.get("parse_xpath")):
        s = Selector(text=con.extract())
        if len(s.xpath(self.xpathConf.get("is_parse_xpath"))) > 0:
            r = [str(self.accountNum)]
            imgs = s.xpath(self.xpathConf.get("face_image")).extract()
            if len(imgs) > 0:
                r.append(imgs[0])
            else:
                r.append("")
            num_info = s.xpath(self.xpathConf.get("num_info")).extract()
            if len(num_info) > 0:
                for num in fo.getWeiboCnUserInfo(num_info[0]):
                    r.append(num)
            else:
                r.append(0)
                r.append(0)
                r.append(0)
            # build the pipe-separated record (exactly five fields) to store in Redis
            save_info = "%s|%s|%s|%s|%s" % tuple(r)
            print r
            print save_info
            self.server.lpush(self.out_key, save_info)
def parse2(self, response):
    item = response.meta['item']
    movie_type = Selector(response).xpath("//div[@class='movie-brief-container']//a/text()")
    print(movie_type.extract())
    plan_date = Selector(response).xpath("//div[@class='movie-brief-container']//ul/li[3]/text()")
    print(plan_date.extract())
    item['type'] = movie_type.extract()
    item['date'] = plan_date.extract()
    yield item
def get_tile_detail(self, response):
    # method name kept for compatibility; "tile" is a typo for "title"
    question_title_ctor = Selector(response).xpath('//h2[@class="zm-item-title zm-editable-content"]/text()')
    question_detail_ctor = Selector(response).xpath('//div[@class="zm-editable-content"]/text()')
    titles = question_title_ctor.extract()
    details = question_detail_ctor.extract()
    title = None if len(titles) < 1 else titles[0]
    detail = None if len(details) < 1 else details[0]
    return title, detail
def parse(self, response):
    self.driver.get(response.url)
    el = Selector(text=self.driver.page_source).xpath('//ul[contains(@class, "thread-list")]/li//h3[contains(@class, "title")]/a/@href')
    requestList = []
    for r in el.extract():
        requestList.append(Request(response.urljoin(r), callback=self.parsePost))
    # the original selected the <a> elements themselves; urljoin needs the @href value
    el = Selector(text=self.driver.page_source).xpath('//*[@id="group-discussions"]/form[1]/a/@href')
    for r in el.extract():
        requestList.append(Request(response.urljoin(r), callback=self.parsePost))
    if len(requestList) > 0:
        return requestList
    self.driver.close()
def parse_stock(self, response):
    # from the current page, pick every td with colspan="3", align="left", height="18" and width="300"
    item = response.meta["item"]
    select_res = Selector(response=response).xpath(
        '//td[@colspan="3" and @align="left" and @height="18" and @width="300"]/font')
    item["business"] = Selector(text=select_res.extract()[1]).xpath('//text()').extract()[0].strip()
    item["boss"] = Selector(text=select_res.extract()[2]).xpath('//text()').extract()[0].strip()
    item["industry"] = Selector(text=select_res.extract()[6]).xpath('//text()').extract()[0].strip()
    self.log('parse_stock %s' % item)
    return item
def parse(self, response):
    self.driver.get(response.url)
    el = Selector(text=self.driver.page_source).xpath('//*[@id="forum-topic-list"]/table[2]/tbody/tr/td[2]/a/@href')
    requestList = []
    for r in el.extract():
        requestList.append(Request(response.urljoin(r), callback=self.parsePost))
    el = Selector(text=self.driver.page_source).xpath('//ul[contains(@class, "pager")]/li/a/@href')
    for r in el.extract():
        requestList.append(Request(response.urljoin(r)))
    if len(requestList) > 0:
        return requestList
    self.driver.close()
def parse_ph_info(self, response):
    phItem = PornVideoItem()
    selector = Selector(response)
    # pull the flashvars JSON blob out of the page's inline JavaScript
    _ph_info = re.findall('var flashvars =(.*?),\n', selector.extract())
    logging.debug('flashvars JSON:')
    logging.debug(_ph_info)
    _ph_info_json = json.loads(_ph_info[0])
    duration = _ph_info_json.get('video_duration')
    phItem['video_duration'] = duration
    title = _ph_info_json.get('video_title')
    phItem['video_title'] = title
    image_url = _ph_info_json.get('image_url')
    phItem['image_url'] = image_url
    link_url = _ph_info_json.get('link_url')
    phItem['link_url'] = link_url
    quality_480p = _ph_info_json.get('quality_480p')
    phItem['quality_480p'] = quality_480p
    # field name "crwal_time" matches the Item definition, so it is kept as-is
    crwal_time = time.strftime('%Y-%m-%d %H:%M:%S', time.localtime(time.time()))
    phItem['crwal_time'] = crwal_time
    logging.info('duration:' + duration + ' title:' + title + ' image_url:' + image_url
                 + ' link_url:' + link_url + ' crwal_time:' + crwal_time)
    yield phItem
def convert(html):
    selector = Selector(text=html)
    article = selector.css('#article')
    articlestr = article.extract()[0]
    # cut the article off at the bibliography heading
    bib = article.xpath('//h2[starts-with(., "Bibliography")]').extract()[0]
    articlebody = articlestr.split(bib)[0]
    sel = Selector(text=articlebody)
    ignores = {
        ',', '.', ':', ';', '(', ')', '“', '”', '"', "'", "-", "—",
        "[", "]", "!", "?", "{", "}", "’", "‘"
    }
    paragraphs = []
    for p in sel.xpath('(//p|//blockquote)'):
        string = BeautifulSoup(p.extract(), features="lxml").get_text()
        string = string.replace('\n', ' ').lower().strip()
        if not string:
            continue
        paragraphs.append(string)
    tokens = nlp("\n".join(paragraphs))
    result = []
    for sent in tokens.sents:
        stokens = [t.text for t in sent]
        stokens = [t for t in stokens if t not in ignores and t]
        if not stokens:
            continue
        s = " ".join(stokens).strip()
        for i in ignores:
            s = s.replace(i, "")
        if not s:
            continue
        result.append(s)
    return result
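# A minimal usage sketch for convert() above, assuming the surrounding module
# imports Selector and BeautifulSoup as the function body implies and defines
# `nlp` as a loaded spaCy pipeline. The sample HTML and its element ids are
# invented for illustration.
import spacy
from bs4 import BeautifulSoup
from scrapy.selector import Selector

nlp = spacy.load("en_core_web_sm")

sample_html = '''
<div id="article">
  <p>Hello, world. This is a test paragraph!</p>
  <h2>Bibliography</h2>
  <p>A reference entry that convert() should drop.</p>
</div>
'''
for sentence in convert(sample_html):
    print(sentence)  # lower-cased sentences with the listed punctuation stripped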
def parse_ph_info(self, response):
    phItem = PornVideoItem()
    selector = Selector(response)
    # pull the flashvars JSON blob out of the page's inline JavaScript
    _ph_info = re.findall('var flashvars =(.*?),\n', selector.extract())
    _ph_info_json = json.loads(_ph_info[0])
    duration = _ph_info_json.get('video_duration')
    phItem['video_duration'] = duration
    title = _ph_info_json.get('video_title')
    phItem['video_title'] = title
    image_url = _ph_info_json.get('image_url')
    phItem['image_url'] = image_url
    link_url = _ph_info_json.get('link_url')
    phItem['link_url'] = link_url
    quality_480p = _ph_info_json.get('quality_480p')
    phItem['quality_480p'] = quality_480p
    phItem['issave'] = 0  # whether the video has been saved locally
    phItem['createtime'] = int(time.time())
    phItem['updatetime'] = 0
    phItem['local_mp4_url'] = ''
    print 'Successfully scraped one item'
    # self.item_list.insert_one(phItem)
    yield phItem
def search(q):
    '''Look up library holdings by book title.'''
    fdata = {
        'tag': 'search',
        'subtag': 'simsearch',
        'gcbook': 'yes',
        'viewtype': '',
        'flword': '',
        'q': q,
    }
    resp = requests.post(search_url, data=fdata)
    # extract the record count
    s_res = Selector(text=resp.content.decode('utf-8')).xpath('//p[@id="page"]/span/text()')
    # if nothing matched, result_list is empty
    result_list = s_res.extract()
    if len(result_list) == 0:
        return "没有检索到记录"
    result_str = result_list[0]
    num = int(s_res.re(r'[\d]+')[0])
    if num > 3:
        note = ""
        if num > 10:
            note = "\n注:只显示前10条结果,得到所有检索结果:" + search_url + "\n======"
        return result_str + "\n======" + note + getManyLinks(resp, num)
    else:
        return result_str + "\n======" + getdetail(resp, num)
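# A short usage sketch for search() above. It assumes search_url, getManyLinks()
# and getdetail() are defined elsewhere in the same module, as the body implies;
# the query string is just an example.
if __name__ == '__main__':
    print(search('Python'))  # hit count, then detail or link listings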
def parse_fans(self, response):
    """Crawl the fan (follower) list."""
    # if this is page 1, queue every remaining page in one go
    if response.url.endswith('page=1'):
        all_page = re.search(r'/> 1/(\d+)页</div>', response.text)
        if all_page:
            all_page = all_page.group(1)
            all_page = int(all_page)
            for page_num in range(2, all_page + 1):
                page_url = response.url.replace('page=1', 'page={}'.format(page_num))
                yield Request(page_url, self.parse_fans, dont_filter=True, meta=response.meta, priority=3)
    selector = Selector(response)
    relationships_item = RelationPageItem()
    relationships_item["page_url"] = response.url
    relationships_item["page_raw"] = selector.extract()  # get raw page content
    relationships_item["user_id"] = response.meta["user_id"]
    relationships_item["relationship"] = "fan"
    relationships_item['crawl_time_utc'] = dt.utcnow()
    yield relationships_item
def parse_login(self, res):
    print ''
    print '>>>>>> parse_login:'
    print 'Final URL: ', res.url
    if 'redirect_times' in res.meta:
        print 'Redirect count: ', res.meta['redirect_times']
    if 'redirect_urls' in res.meta:
        print 'Redirect list: ', res.meta['redirect_urls']
    success = re.match(self.settings['RE_URL_LOGIN_SUCCESS'], res.url, re.I)
    if success:
        print 'Login succeeded'
        print ''
        print 'Checking for an existing reservation:'
        return self.start_subscribe()
        # return self.start_monitor()
    else:
        print 'Login failed: '
        msg = Selector(res).css('.error_tips .et_con p::text')
        if msg:
            msg = msg.extract()[0]
        else:
            msg = 'Unknown error'
        print msg
        print ''
        return
def after_login(self, response):
    html = Selector(response)
    with open("login.html", 'w') as pf:
        pf.write(html.extract().encode('utf-8'))
    for url in self.start_urls:
        yield Request(url, meta={'cookiejar': response.meta['cookiejar']})
def parse(self, response):
    movies = Selector(response=response).xpath('//div[@class="movie-hover-info"]')
    print('movies.extract(): ', movies.extract())
    # only the first ten movie blocks are wanted
    for movie in movies[0:10]:
        item = PoSpidersItem()
        film_name = movie.xpath('./div[1]/span/text()').extract_first()
        print(film_name)
        film_type = movie.xpath('./div[2]/text()').extract()[1].strip()
        print(film_type)
        film_time = movie.xpath('./div[4]/text()').extract()[1].strip()
        print(film_time)
        item['film_name'] = film_name
        item['film_type'] = film_type
        item['film_time'] = film_time
        yield item
def scrapingQSA(self, receita):
    # fill in the QSA (quadro social) block
    receita['qsa'] = {}
    capital_social = Selector(text=self.html).xpath('/html/body/table[2]/tbody/tr/td/table/tbody/tr[3]/td[2]/text()')
    if capital_social:
        receita['qsa']['capital_social'] = capital_social.extract()[0].strip(' \r\n\t')
    else:
        receita['qsa']['capital_social'] = "NAO PREENCHIDO"
    qsa = Selector(text=self.html).xpath('/html/body/table[3]/tbody/tr/td/table[3]/tbody/tr')
    if qsa:
        receita['qsa']['quadro_social'] = []
        for k in range(1, len(qsa)):
            tmpQuadro = {}
            nome_empresarial = Selector(text=self.html).xpath(
                '/html/body/table[3]/tbody/tr/td/table[3]/tbody/tr[' + str(k) + ']/td/fieldset/table/tbody/tr/td[1]/table/tbody/tr[1]/td[2]/text()').extract()
            qualificacao = Selector(text=self.html).xpath(
                '/html/body/table[3]/tbody/tr/td/table[3]/tbody/tr[' + str(k) + ']/td/fieldset/table/tbody/tr/td[1]/table/tbody/tr[2]/td[2]/text()').extract()
            if len(nome_empresarial) > 0:
                tmpQuadro["nome_empresarial"] = nome_empresarial[0].strip(' \r\n\t')
            if len(qualificacao) > 0:
                tmpQuadro["qualificacao"] = qualificacao[0].strip(' \r\n\t')
            receita['qsa']['quadro_social'].append(tmpQuadro)
    else:
        receita['qsa']['quadro_social'] = "A NATUREZA JURIDICA NAO PERMITE O PREENCHIMENTO DO QSA"
    return receita
def parse(self, response):
    url = response.url
    print url
    if url in self.url_already_scrapy:
        print url, 'already crawled'
        return
    selector = Selector(response)
    print selector.extract()
    # a stray `return` here in the original made everything below unreachable
    self.url_already_scrapy.append(url)
    video = Video('new video')
    if url.startswith("http://v.xxxiao.com"):
        self.parseXXXiao(response, video)
    elif url.startswith("http://www.meipai.com"):
        self.parseMeipai(response, video)
    self.saveVideo(video)
def parse_ph_info(self, response):
    video_url = response.xpath('//*[@id="player"]/div[21]/video/source/@src').extract_first()
    phItem = PornVideoItem()
    phItem['file_urls'] = [video_url]
    selector = Selector(response)
    # pull the flashvars JSON blob out of the page's inline JavaScript
    _ph_info = re.findall('var flashvars =(.*?),\n', selector.extract())
    logging.debug('flashvars JSON:')
    logging.debug(_ph_info)
    _ph_info_json = json.loads(_ph_info[0])
    duration = _ph_info_json.get('video_duration')
    phItem['video_duration'] = duration
    title = _ph_info_json.get('video_title')
    phItem['video_title'] = title
    image_url = _ph_info_json.get('image_url')
    phItem['image_url'] = image_url
    link_url = _ph_info_json.get('link_url')
    phItem['link_url'] = link_url
    quality_480p = _ph_info_json.get('quality_480p')
    phItem['quality_480p'] = quality_480p
    logging.info('duration:' + duration + ' title:' + title + ' image_url:' + image_url + ' link_url:' + link_url)
    yield phItem
def parse(self, response):
    selector = Selector(response)
    information_item = InformationItem()
    uid_from_url = re.findall(r'(\d+)/info', response.url)
    if uid_from_url:
        information_item['_id'] = uid_from_url[0]  # get user id
    else:
        information_item['_id'] = "NA"
    information_item['page_url'] = response.url.replace(self.base_url, self.weibo_baseurl)
    information_item['page_raw'] = selector.extract()  # get raw page content
    information_item['crawl_time_utc'] = dt.utcnow()
    yield information_item
    # request tweets page
    if uid_from_url:
        yield Request(url=self.base_url + '/{}/profile?page=1'.format(information_item['_id']),
                      callback=self.parse_tweet,
                      meta={'user_id': information_item['_id']},
                      priority=1)
    else:
        yield Request(url=response.url + '?page=1',
                      callback=self.parse_tweet,
                      meta={'user_id': information_item['_id']},
                      priority=1)
def parse_item(self, response):
    page_num = self.get_page_num_from_url(response.url)
    crawl_time = date_str_now_ymd()  # current time
    if response.status != 200:
        print '#' * 100
        print str(response.status)
        print '#' * 100
        print str(response.body)
        print '#' * 100
        # todo: yield handle_captcha(self, response)
    selector = Selector(response)
    for dz_selector in selector.xpath("//div[@class='row']"):
        dz_selector = Selector(text=dz_selector.extract())
        yield self.gen_new_duanzi_item_by_selector(dz_selector)
    next_url = "".join(selector.xpath(
        "/html/body/div[@id='wrapper']/div[@id='body']/div[@id='content']/div[@id='comments']/div[@class='comments'][1]/div[@class='cp-pagenavi']/a[@class='previous-comment-page']/@href"
    ).extract())
    if next_url:
        yield self.gen_next_request(next_url)
    print 50 * '*'
def parse2(self, response):
    try:
        # constructing WebDriverWait does not block; without .until(...) this never actually waits
        timeout = WebDriverWait(self.driver, 10)
    except:
        print("Timed out waiting for page load.")
        self.driver.quit()
    title = Selector(response).xpath('//div[@class="leftContainer"]/div/div/div/div/a/img[@id="coverImage"]/@alt')
    genre = Selector(response).xpath('//div[@class="rightContainer"]/div/div/div[@class="bigBoxBody"]/div/div/div[@class="left"]/a/text()')
    rating = Selector(response).xpath('//div[@class="leftContainer"]/div/div[@id="metacol"]/div[@id="bookMeta"]/span/span[@class="average"]/text()')
    reviews = Selector(response).xpath('//div[@id="bookReviews"]/div[@class="friendReviews elementListBrown"]')
    for review in reviews:
        try:
            item = GoodreadsItem()
            item['title'] = title.extract()[0]
            item['rating'] = rating.extract()[0]
            item['book_url'] = response.meta['book_url']
            item['genre'] = genre.extract()[0]
            item['link_url'] = review.xpath('.//div/div/link/@href').extract()[0]
            item['reviewDate'] = review.xpath('.//div/div/div/div/a/text()').extract()[0]
            item['user'] = review.xpath('.//div/div/div/div/span/a/text()').extract()[0]
            review_text = review.xpath('.//div/div/div/div[@class="reviewText stacked"]/span/span[2]/text()').extract()[0]
            # remove html tags
            item['review'] = remove_tags(review_text)
        except IndexError as e:
            print(e, ": title: ", item['title'], "user: ", item['user'])
            logger.error(e.args[0])
            raise
        yield item
def extract_entry_destination(self, response):
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]//table[1]//tr[3]"
    destination = Selector(response=response).xpath(
        trip_info_xpath + "//*[contains(text(), 'Destination')]")
    if destination:
        return destination.extract()
    return None
def extract_entry_start_loc(self, response):
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]//table[1]//tr[3]"
    start_loc = Selector(response=response).xpath(
        trip_info_xpath + "//*[contains(text(), 'Starting Location')]")
    if start_loc:
        return start_loc.extract()
    return None
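# The two extractors above differ only in the label text they look for. A
# parameterized helper could serve both; this sketch is an illustration (the
# name extract_entry_field is not from the source) and assumes the same scrapy
# Selector import used above.
def extract_entry_field(self, response, label):
    trip_info_xpath = "/html/body/table//tr[4]/td/table/tr//td[2]//table[1]//tr[3]"
    field = Selector(response=response).xpath(
        trip_info_xpath + "//*[contains(text(), '%s')]" % label)
    return field.extract() if field else None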
def parse_start_url(self, response):
    item = StockItem()
    sel = Selector(response)
    log.msg(response.url, level=log.INFO)
    log.msg(response.encoding, level=log.INFO)
    item['stocks'] = sel.extract()
    log.msg(item['stocks'], level=log.INFO)
    return item
def parse(self, response):
    items = [dict(film_name='电影名称', film_type='电影类型', plan_date='上映日期')]
    print(response.encoding)
    # close each <dd> so the broken markup parses, then address rows by index
    html = response.text.replace("<dd>", "</dd><dd>")
    for i in range(1, 11):
        item = SpidersItem()
        film_name = Selector(text=html).xpath(f'//*[@id="app"]/div/div[2]/div[2]/dl/dd[{i}]/div[1]/div[2]/a/div/div[1]/span[1]/text()')
        film_type = Selector(text=html).xpath(f'//*[@id="app"]/div/div[2]/div[2]/dl/dd[{i}]/div[1]/div[2]/a/div/div[2]/text()')
        plan_date = Selector(text=html).xpath(f'//*[@id="app"]/div/div[2]/div[2]/dl/dd[{i}]/div[1]/div[2]/a/div/div[4]/text()')
        item['film_name'] = film_name.extract_first().strip()
        item['film_type'] = film_type.extract()[1].strip()
        item['plan_date'] = plan_date.extract()[1].strip()
        items.append(item)
    return items
def parse_ph_info(self, response):
    ph_item = PornVideoItem()
    selector = Selector(response)
    # pull the flashvars JSON blob out of the page's inline JavaScript
    _ph_info = re.findall('var flashvars =(.*?),\n', selector.extract())
    logging.debug('flashvars JSON:')
    logging.debug(_ph_info)
    _ph_info_json = json.loads(_ph_info[0])
    image_url = _ph_info_json.get('image_url')
    duration = _ph_info_json.get('video_duration')
    title = _ph_info_json.get('video_title')
    link_url = _ph_info_json.get('link_url')
    quality_480p = _ph_info_json.get('quality_480p')
    ph_item['video_duration'] = duration
    ph_item['video_title'] = title
    ph_item['image_url'] = image_url
    ph_item['link_url'] = link_url
    ph_item['quality_480p'] = quality_480p
    # name local files after the SHA-1 of the video URL so repeats are detected
    sha1_object = sha1()
    sha1_object.update(quality_480p)
    file_sha1 = sha1_object.hexdigest()
    image_file_name = os.path.join(self.file_dir, file_sha1 + '.jpg')
    mp4_file_name = os.path.join(self.file_dir, file_sha1 + '.mp4')
    if os.path.exists(mp4_file_name):
        ph_item['exists'] = True
        yield ph_item
    else:
        ph_item['exists'] = False
        ph_item['video_file_path'] = mp4_file_name
        ph_item['image_file_path'] = image_file_name
        # fetch the thumbnail with pycurl, sending the page URL as the referer
        curl = pycurl.Curl()
        curl.setopt(pycurl.URL, image_url)
        curl.setopt(pycurl.REFERER, response.url)
        curl.setopt(pycurl.SSL_VERIFYPEER, 1)
        curl.setopt(pycurl.SSL_VERIFYHOST, 2)
        curl.setopt(pycurl.WRITEDATA, open(image_file_name, "wb"))
        curl.perform()
        curl.close()
        # fetch the 480p video the same way
        curl2 = pycurl.Curl()
        curl2.setopt(pycurl.URL, quality_480p)
        curl2.setopt(pycurl.REFERER, response.url)
        curl2.setopt(pycurl.SSL_VERIFYPEER, 1)
        curl2.setopt(pycurl.SSL_VERIFYHOST, 2)
        curl2.setopt(pycurl.WRITEDATA, open(mp4_file_name, "wb"))
        curl2.perform()
        curl2.close()
        yield ph_item
def parse_all_content(self, response):
    # when the tweet has a "read full text" link, fetch the full content page
    selector = Selector(response)
    tweet_item = TweetItem()
    tweet_item['_id'] = self.get_tweet_id(response.url)
    tweet_item['user_id'] = response.meta["user_id"]
    tweet_item['page_url'] = response.url
    tweet_item['page_raw'] = selector.extract()  # get raw page content
    tweet_item['crawl_time_utc'] = dt.utcnow()
    yield tweet_item
def parse_project(self, response):
    common.dump_response(self.settings, response)
    meta_json = response.meta['json']  # avoid shadowing the json module
    sel = Selector(response)
    item = KickstarterItem()
    item['title'] = meta_json['name']
    item['currency'] = meta_json['currency']
    item['goal'] = float(meta_json['goal'])
    item['date'] = int(meta_json['deadline'])
    # Remove html tags from description here since we're in the scrapy context and have relevant utilities
    item['rawtext'] = ' '.join(
        s.extract() for s in sel.xpath('//div[@class="full-description"]//text()')
    ) + ' ' + ' '.join(
        s.extract() for s in sel.xpath('//div[@class="short_blurb"]//text()'))
    item['web'] = response.url
    return [item]
def parse_interview_info(self, response):
    interviewItem = InterviewItem()
    selector = Selector(response)
    logging.info(selector)
    _interview_info = re.findall('(.*?),\n', selector.extract())
    logging.debug('info JSON:')
    logging.debug(_interview_info)
    _ph_info_json = json.loads(_interview_info[0])
    duration = _ph_info_json.get('video_duration')
    logging.info('duration:' + duration)
    yield interviewItem
def parse(self, response):
    if self.user_db == None:
        self.user_db = leveldb.LevelDB('./user_db')
    match = self.p.match(response.url)
    current_page = int(match.group(1))
    print('current_page =', current_page)
    body = response.body
    els = Selector(text=body).xpath("//table[@class='card-table-material']/tbody")[0]
    for tr in els.xpath('tr'):
        tds = tr.xpath('td')
        if len(tds) < 2:
            continue
        rank = tds[0].xpath('text()').extract()[0].strip()
        name_node = tds[1].xpath("a[@data-tooltip='notooltip']")
        href = name_node.xpath('@href').extract()[0].strip()
        id = name_node.xpath('text()').extract()[0].strip()
        rating = tds[2].xpath("div[@class='pull-right']/text()").extract()[0].strip()
        n_game = tds[3].xpath('text()').extract()[0].strip()
        print(rank, id, href, rating, n_game)
        if args.L == 1:
            continue
        if len(self.pubg_api_key) == 0:
            # no key: scrape the public profile page instead of the API
            yield scrapy.Request(
                url='https://pubgtracker.com/profile/pc/%s/squad?region=as' % (id,),
                callback=self.parse_user)
        else:
            yield scrapy.Request(
                url='https://pubgtracker.com/api/profile/pc/%s' % (id,),
                headers={'TRN-Api-Key': self.pubg_api_key},
                callback=self.parse_user_api)
    next_page = Selector(text=body).xpath("//a[@class='next next-page']")
    print(next_page.extract())
    if len(next_page.xpath('@disabled')) == 0:
        # the "next" link is enabled, so queue the next page
        # (the original URL had "&region" mangled into the "®ion" entity)
        url_pattern = 'https://pubgtracker.com/leaderboards/pc/Rating?page=%d&mode=3&region=3'
        next_url = url_pattern % (current_page + 1)
        print('next_page =', next_url)
        if has_we_chat:
            if current_page % 5 == 0:
                itchat.send(u'Working on ' + next_url, 'filehelper')
        if current_page < args.N:
            if current_page == -1:
                yield scrapy.Request(url=next_url, meta={"dont_cache": True}, callback=self.parse)
            else:
                yield scrapy.Request(url=next_url, callback=self.parse)
    return
def do(self, input_, **kwargs):
    if isinstance(input_, (Response, SelectorList)):
        res = input_.css(self._path)
    elif isinstance(input_, basestring):
        res = Selector(text=input_).css(self._path)
    else:
        raise Exception(__name__ + ' unknown type of argument ' + str(type(input_)))
    if not self._return_selector:
        res = res.extract()
    if not self._return_multi:
        res = res[0] if res else None
    return res
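# A small, self-contained wiring sketch for do() above. The Step class and its
# constructor are assumptions for illustration; only the _path, _return_selector
# and _return_multi attributes are implied by do() itself. Runs under Python 2,
# since do() relies on basestring.
from scrapy.http import Response
from scrapy.selector import Selector, SelectorList

class Step(object):
    do = do  # bind the function defined above as a method

    def __init__(self, path, return_selector=False, return_multi=False):
        self._path = path
        self._return_selector = return_selector
        self._return_multi = return_multi

print(Step('h1::text').do('<html><h1>Hello</h1></html>'))  # 'Hello'
print(Step('h1::text', return_multi=True).do('<html><h1>A</h1><h1>B</h1></html>'))  # ['A', 'B']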
def parse_torrent(self, response):
    sel = Selector(response)
    _text = lambda s: s.extract()[0].strip().replace(u'\xa0', ' ')
    url = response.url
    name = _text(sel.xpath('//h1/text()'))
    description = _text(sel.xpath('//div[@id="description"]/text()[2]'))
    size = _text(sel.xpath('//div[@id="specifications"]/p[2]/text()[2]'))
    torrent = TorrentItem()
    torrent['url'] = url
    torrent['name'] = name
    torrent['description'] = description
    torrent['size'] = size
    return torrent
def parse(self, response):
    reload(sys)
    sys.setdefaultencoding('utf-8')
    page_bar = Selector(response).xpath('//div[@class="zm-invite-pager"]/span/a/text()')
    if not page_bar:
        print "error"
        return
    page_nums = page_bar.extract()
    max_page_num = int(page_nums[-2])
    for page_num in range(8000, max_page_num):
        url = response.url + "?page=" + str(page_num)
        yield Request(url, callback=self.topic_handler)
def parse(self, response):
    rows = response.selector.xpath('//tbody/tr[starts-with(@id, "row")]')
    N = len(rows)
    print('total = ' + repr(N))
    approvals = []
    for i in range(N):
        rowex = Selector(text=rows[i].extract()).xpath('//nobr/text()')
        data = rowex.extract()
        if data[8] == 'approved' and data[3].startswith('Potomac'):
            for v in data:
                if v.endswith('days'):
                    elapsed = v
            approvals.append([data[3], data[4], elapsed])
    if len(approvals) > 0:
        yield {"data": approvals}
def parse(self, response):
    name_xpath = '//div[@class="infoAdd"]/text()'
    log_xpath = '//div[@class="infotxt"]/text()'
    for s in response.selector.xpath('//ul[@id="infoWind"]/li'):
        s = Selector(text=re.sub(r'[\n\t]', '', s.extract(), flags=re.MULTILINE))
        station = items.Station()
        station['name'] = s.xpath(name_xpath).extract()[0]
        yield station
        log = items.StationLog()
        log['station'] = station['name']
        log_extracted = s.xpath(log_xpath).extract()
        log['available_bikes'] = int(log_extracted[0].replace(': ', ''))
        log['empty_docks'] = int(log_extracted[1].replace(': ', ''))
        log['crawl_date'] = datetime.now()
        yield log
def parse(self, response):
    # print the page URL
    print(response.url)
    urls = Selector(response=response).xpath('//div[@class="channel-detail movie-item-title"]/a/@href')
    print("urls:", urls[:10])
    urls = urls.extract()
    urls = urls[:10]
    print("urls:", urls)
    for url in urls:
        time.sleep(1)
        link = url_prefix + url
        yield scrapy.Request(url=link, callback=self.parse2)
def parse(self, response):
    current_page = int(response.url.split("&")[-1].split("=")[-1])
    print("[DEBUG] response.url:" + str(response.url))
    selector = Selector(response)
    searchpage_item = SearchPageItem()
    searchpage_item['page_url'] = re.sub("https://.*?/fireprox", self.weibo_baseurl, response.url)
    searchpage_item['page_raw'] = selector.extract()  # get raw page content
    searchpage_item['search_key'] = searchpage_item['page_url'].split("&")[0].split("=")[-1]
    searchpage_item['sort_setting'] = searchpage_item['page_url'].split("&")[1].split("=")[-1]
    searchpage_item['filter_setting'] = searchpage_item['page_url'].split("&")[2].split("=")[-1]
    searchpage_item['crawl_time_utc'] = dt.utcnow()
    yield searchpage_item
    tree_node = etree.HTML(response.body)
    tweet_nodes = tree_node.xpath('//div[@class="c" and @id]')
    if len(tweet_nodes) == 0 and current_page != 1:
        # .get() avoids a KeyError on requests that never set the counter
        if response.meta.get("empty_page_count", 0) > 0:
            empty_page_count = response.meta["empty_page_count"] + 1
        else:
            empty_page_count = 1
    else:
        empty_page_count = 0
    # stop paging after three consecutive empty pages
    if empty_page_count != 3:
        next_page = current_page + 1
        page_url = re.sub("https://.*?/fireprox", self.get_base_url(), response.url)
        page_url = page_url.replace('page=' + str(current_page), 'page={}'.format(next_page))
        yield Request(page_url, self.parse, dont_filter=True,
                      meta={'empty_page_count': empty_page_count}, priority=1)
def parse_m4a_url(self, response):
    phItem = Lang151Item()
    selector = Selector(response)
    # recover the audio URL embedded in the page source
    _ph_info = re.findall(r'http://audio(.*?)\'', selector.extract())
    url = 'http://audio' + _ph_info[0]
    if url.find('.m4a') == -1:
        url = url + '.m4a'
    logging.debug('download URL =====>>>>> ' + url)
    file_dir = "/home/david/download_m4a/m4a/"
    if not os.path.exists(file_dir):
        os.makedirs(file_dir)
    dm = download_m3u8()
    dm.start_download(url, file_dir, format(self.file_name) + '.m4a')
    self.file_name += 1
    yield phItem
def parse_ph_info(self, response):
    phItem = PornVideoItem()
    selector = Selector(response)
    # pull the flashvars_* JSON blob out of the page's inline JavaScript
    _ph_info = re.findall('flashvars_.*?=(.*?);\n', selector.extract())
    logging.debug('flashvars JSON:')
    logging.debug(_ph_info)
    _ph_info_json = json.loads(_ph_info[0])
    duration = _ph_info_json.get('video_duration')
    phItem['video_duration'] = duration
    title = _ph_info_json.get('video_title')
    phItem['video_title'] = title
    image_url = _ph_info_json.get('image_url')
    phItem['image_url'] = image_url
    link_url = _ph_info_json.get('link_url')
    phItem['link_url'] = link_url
    quality_480p = _ph_info_json.get('quality_480p')
    phItem['quality_480p'] = quality_480p
    logging.info('duration:' + duration + ' title:' + title + ' image_url:' + image_url + ' link_url:' + link_url)
    yield phItem
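# A self-contained sketch of the flashvars technique the parse_ph_info()
# variants above share: find the inline-JS assignment and parse its right-hand
# side as JSON. The sample page text is invented for illustration.
import json
import re

page_source = '''<script>
var flashvars_12345 = {"video_duration": "600", "video_title": "demo"};
</script>'''

_info = re.findall(r'flashvars_.*?=(.*?);\n', page_source)
info = json.loads(_info[0])
print(info['video_duration'] + ' ' + info['video_title'])  # -> 600 demo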
def parse_doctor(self, response):
    response_url = response.url
    doctor_id = re.search(r"doctor/([^.]*)\.htm", response_url).group(1)
    hxs = Selector(response)
    f.write(hxs.extract())  # f: module-level file handle used to dump raw pages (defined elsewhere)
    # parse doctor name
    name_list = hxs.xpath("//input[@name='doctor_name']/@value")
    doctor_name = ""
    if len(name_list) != 0:
        doctor_name = name_list[0].extract()
    # hospital and department
    hospital = ""
    department = ""
    hd_selectors = hxs.xpath("//div[@class='luj']")
    if len(hd_selectors) != 0:
        hospital_selectors = hd_selectors.xpath("a[contains(@href, '/hospital/')]/text()")
        if len(hospital_selectors) == 1:
            hospital = hospital_selectors[0].extract()
        department_selectors = hd_selectors.xpath("a[contains(@href, '/faculty/')]/text()")
        if len(department_selectors) == 1:
            department = department_selectors[0].extract()
    # disease poll counts: "<n>票" is the number of votes per disease
    disease_poll_count_re = re.compile(u"(?P<poll_count>\d+)票")
    disease_polls = []
    disease_selectors = hxs.xpath('//div[@class="ltdiv"]//a')
    for ds in disease_selectors:
        ds_name_selectors = ds.xpath("text()")
        if len(ds_name_selectors) != 0:
            ds_dict = dict()
            ds_dict["name"] = ds_name_selectors[0].extract()
            poll_selectors = ds.xpath("following-sibling::text()")
            if len(poll_selectors) != 0:
                poll_count_match = disease_poll_count_re.search(poll_selectors[0].extract())
                if poll_count_match:
                    ds_dict["count"] = poll_count_match.group("poll_count")
            disease_polls.append(ds_dict)
    # professional title (职称), e.g. "主任医师"
    title = ""
    title_selectors = hxs.xpath(u"//td[text()='职 称:']/following-sibling::td/text()")
    if len(title_selectors) == 1:
        title_re = re.compile(u"(?P<title>\S+医师)")
        title_match = title_re.search(title_selectors[0].extract())
        if title_match:
            title = title_match.group("title")
    # personal image
    image_url = ""
    image_selectors = hxs.xpath("//div[@class='ys_tx']/table//tr/td/img/@src")
    if len(image_selectors) != 0:
        image_url = image_selectors[0].extract()
    feature = ""
    bio = ""
    # bio from the full <div id="full"> block (skip pages that embed an image there)
    bio_selectors = hxs.xpath('//div[@id="full"]')
    if len(bio_selectors) != 0:
        special_img_selectors = bio_selectors.xpath(".//img")
        if len(special_img_selectors) == 0:  # no special img
            content = bio_selectors[0].extract()
            bio_re = re.compile(r'id="full"[^>]*>[\s]*(.*?)<span>', re.S)
            bio_match = bio_re.search(content)
            if bio_match is not None:
                bio = bio_match.group(1)
                bio = bio.replace("<br>", "\n")
                comment_re = re.compile("<!--.*?-->")
                bio = comment_re.sub("", bio)
    # feature fetched through its div
    if feature == "":
        feature_selectors = hxs.xpath('//div[@id="truncate_DoctorSpecialize"]/text()')
        if len(feature_selectors) != 0:
            feature = feature_selectors[0].extract()
    # bio from the truncated div
    if bio == "":
        bio_selectors = hxs.xpath('//div[@id="truncate"]/text()')
        if len(bio_selectors) != 0:
            bio = bio_selectors[0].extract()
    # bio in neither div: pull it from the bare td after the "执业经历:" label
    if bio == "":
        bio_xpath = '//td[text()="' + u"执业经历:" + '"]/parent::*/td[3]'
        bio_selectors = hxs.xpath(bio_xpath)
        if len(bio_selectors) != 0:
            bio_re = re.compile("<td.*?>(?P<bio_content>.*?)<br", re.S)
            bio_match = bio_re.search(bio_selectors[0].extract())
            if bio_match is not None:
                bio = bio_match.group(1)
    # treat "暂无" ("none yet") placeholders as empty
    zhanwu_re = re.compile(u"暂无")
    if zhanwu_re.search(bio) is not None:
        bio = ""
    if zhanwu_re.search(feature) is not None:
        feature = ""
    # format filter: strip anchors, whitespace and escape debris
    format_filter_re = re.compile(r"(<a .*?>|<\\/a>|\n|\t|\r|\\| )")
    if bio != "":
        bio = format_filter_re.sub("", bio)
    if feature != "":
        feature = format_filter_re.sub("", feature)
    # schedule: skip the header row (day_part 0); later rows are day parts (morning, etc.)
    doctor_schedule = []
    trs = hxs.xpath("//table[@class='doctortimefrom1']//tr")
    day_part = 0
    for itr in trs:
        if 0 != day_part:
            doctor_schedule.extend(self.weekday_operation(itr, day_part))
        day_part += 1
    item = DoctorDetailItem()
    item["doctor_id"] = doctor_id
    item["_name"] = doctor_name
    item["city"] = response.meta["city"]
    item["hospital"] = hospital
    item["department"] = department
    item["title"] = title
    item["schedule"] = doctor_schedule
    item["feature"] = feature
    item["bio"] = bio
    if image_url:
        item["image"] = image_url
    if disease_polls:
        item["disease"] = disease_polls
    yield item
def parse_item(self, response):
    item = zillowItem()
    req = {}
    # Get house address #
    try:
        address = Selector(response=response).xpath('//header[@class = "zsg-content-header addr"]/h1/text()').extract()[0].split(',')[0]
        req['address'] = address
    except:
        req['address'] = None
    # Get street number from address #
    try:
        street = re.findall(r'\s(\w+)\sSt', address)[0]
        req['street'] = street
    except:
        req['street'] = None
    try:
        avenue = re.findall(r'\s(\w+)\sAve', address)[0]
        req['avenue'] = avenue
    except:
        req['avenue'] = None
    try:
        road = re.findall(r'\s(\w+)\sRd', address)[0]
        req['road'] = road
    except:
        req['road'] = None
    # Get sale price #
    try:
        sale_price = Selector(response=response).xpath('//div[@class = "main-row home-summary-row"]/span/text()')
        if sale_price:
            req['sale_price'] = int(re.sub(r'\W+', "", sale_price.extract()[0]))
        else:
            sale_price = Selector(response=response).xpath('//div[@class = "main-row status-icon-row recently-sold-row home-summary-row"]/span/text()').extract()[0]
            req['sale_price'] = int(re.sub(r'\W+', "", sale_price))
    except:
        req['sale_price'] = None
    # Get address specifics #
    try:
        address = Selector(response=response).xpath('//span[@class = "zsg-h2 addr_city"]/text()').extract()[0]
        groups = re.findall(r'(.+)\,\s(\w{2})\s(\d{5})', address)[0]
        req['city'] = groups[0]
        req['state'] = groups[1]
        req['zipcode'] = groups[2]
    except:
        req['city'] = None
        req['state'] = None
        req['zipcode'] = None
    # Get room types #
    links1 = Selector(response=response).xpath('//span[@class = "addr_bbs"]/text()').extract()
    try:
        k = re.search('Studio', links1[0])
        j = re.search(r'(\d+)', links1[0])
        if k != None:
            req['beds'] = 0.5
        elif j != None:
            req['beds'] = int(re.findall(r'(\d+)', links1[0])[0])
        else:
            req['beds'] = None
    except:
        req['beds'] = None
    try:
        req['baths'] = int(re.findall(r'(\d+)', links1[1])[0])
    except:
        req['baths'] = None
    # Get house details #
    links2 = Selector(response=response).xpath('//ul[@class = "zsg-list_square zsg-lg-1-3 zsg-md-1-2 zsg-sm-1-1"]')
    item['title'] = []
    for link in links2:
        item['title'].extend(link.xpath('li/text()').extract())
    info2 = item['title']
    info3 = {}
    info4 = []
    for items in info2:
        try:
            if ':' in items:
                x = re.split(r':\s', items)
                info3[x[0]] = x[1]
            else:
                info4.append(items)
        except:
            continue
    item['title'] = cleanDicts(info3)
    item['title'].update(cleanSingleLines(info4))
    item['title'].update(req)
    # Return all the acquired information #
    yield item