def crawlListPage(self):
    print 'Start crawling the list pages'
    self.openPage("http://hotel.elong.com/nanjing/")
    # loop counter for the current page (reset on every page turn)
    loop_num = 0
    # whether the current page has been handled: False = not yet processed
    if_handle = False
    # total number of pages
    page_num = 0
    hotel_num = int(self.driver.find_element_by_xpath("//span[@class='t24 mr5']").text)
    if hotel_num % 20 == 0:
        page_num = hotel_num / 20
    else:
        page_num = hotel_num / 20 + 1
    # for testing: crawl only 5 pages
    #page_num = 5
    while page_num >= 1:
        loop_num += 1
        self.driver.find_element_by_tag_name("body").send_keys(Keys.END)
        #self.driver.find_element_by_tag_name("body").send_keys(Keys.PAGE_UP)
        # u"返后价" ("price after cash-back") only appears once the hotel rows have rendered
        if u"返后价" in self.driver.page_source:
            if if_handle == False:
                self.__parseUrls(self.driver.page_source)
                print u"Number of hotels collected: %d" % len(self.listPageInfo)
                if_handle = True
            try:
                # if the loading indicator is still visible, wait 0.1s and re-check
                response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
                _loading = response.xpath("//div[@id='_loading_']/@style").extract()
                while 1:
                    if _loading == []:
                        break
                    if u'none' in _loading[0]:
                        break
                    else:
                        #print 'still loading......'
                        time.sleep(0.1)
                        response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
                        _loading = response.xpath("//div[@id='_loading_']/@style").extract()
                # u"下一页" is the "next page" link text
                if u"下一页" in self.driver.page_source:
                    self.driver.find_element_by_xpath("//div[@class='paging1']/a[@class='page_next']").click()
                    page_num -= 1
                    if_handle = False
                    loop_num = 0
                    time.sleep(random.uniform(1, 3))
            except Exception, e:
                print "error happened while clicking next page"
                print e
        if loop_num != 0:
            if loop_num < 15:
                time.sleep(1)
                continue
            else:
                break
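# The function above, like most snippets below, wraps raw Selenium page_source in an
# HtmlResponse with a placeholder URL so that Scrapy selectors can be used outside a
# crawl. A minimal reusable sketch of that pattern; the helper name selenium_response
# is illustrative, not part of the original code:
from scrapy.http import HtmlResponse

def selenium_response(driver, url="about:blank"):
    """Wrap the current Selenium page source so it can be queried with .xpath()/.css()."""
    return HtmlResponse(url=url, body=driver.page_source, encoding="utf-8")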
def parse_kb(self, response):
    mib = None

    # need to perform some nasty segmentation because different firmware
    # versions are not clearly separated;
    # reverse order to get MIB before firmware items
    for entry in reversed(response.xpath(
            "//div[@id='support-article-downloads']/div/p")):
        for segment in reversed(entry.extract().split("<br><br>")):
            resp = HtmlResponse(
                url=response.url, body=segment, encoding=response.encoding)

            for href in resp.xpath("//a/@href").extract():
                text = resp.xpath("//text()").extract()

                if "MIBs" in href:
                    mib = href
                elif "firmware" in href:
                    item = FirmwareLoader(
                        item=FirmwareImage(), response=resp,
                        date_fmt=["%m/%d/%Y"])
                    item.add_value("date", item.find_date(text))
                    item.add_xpath("url", "//a/@href")
                    item.add_value("mib", mib)
                    item.add_value("product", response.meta["product"])
                    item.add_value("vendor", self.name)
                    item.add_value(
                        "version", FirmwareLoader.find_version_period(text))
                    yield item.load_item()
def parse_kb(self, response):
    # initial html tokenization to find regions segmented by e.g. "======" or "------"
    filtered = response.xpath(
        "//div[@class='sfdc_richtext']").extract()[0].split("=-")

    for entry in [x and x.strip() for x in filtered]:
        resp = HtmlResponse(url=response.url, body=entry,
                            encoding=response.encoding)
        for link in resp.xpath("//a"):
            href = link.xpath("@href").extract()[0]
            if "cache-www" in href:
                text = resp.xpath("//text()").extract()
                text_next = link.xpath("following::text()").extract()

                item = FirmwareLoader(
                    item=FirmwareImage(), response=response,
                    date_fmt=["%b %d, %Y", "%B %d, %Y", "%m/%d/%Y"])
                version = FirmwareLoader.find_version_period(text_next)
                if not version:
                    version = FirmwareLoader.find_version_period(text)

                item.add_value("version", version)
                item.add_value("date", item.find_date(text))
                item.add_value("url", href)
                item.add_value("product", response.meta["product"])
                item.add_value("vendor", self.name)
                yield item.load_item()
def __crawlHotelComment(self, driver, hotel_id, pagenum):
    pagenum = int(pagenum)
    # walk through every comment page
    while pagenum >= 1:
        response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
        loading = response.xpath("//div[@id='commentLoading']/@style").extract()[0]
        # only scrape once the loading indicator is hidden
        while loading != u'display: none;':
            print 'loading......'
            time.sleep(0.1)
            response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
            loading = response.xpath("//div[@id='commentLoading']/@style").extract()[0]
        itemlist = response.xpath("//ul[@class='dcomt_list']/li")
        for item in itemlist:
            username = item.xpath(".//div[@class='dcomt_head left']/div[2]/span/text()").extract()[0]
            remarkText = item.xpath(".//p[@class='dcomt_con_txt']/text()").extract()[0]
            # TODO filter out non-Chinese characters; needs rework
            remarkText = remarkText.encode("gbk", 'ignore')
            remarkText = remarkText.decode("gbk")
            remark = ''
            for string in remarkText:
                remark = remark + re.sub("\s+", "", string)
            user_type = item.xpath(".//div[@class='dcomt_head_pic']/p/text()").extract()[0]
            comm_time = item.xpath(".//span[@class='dcomt_con_time']/text()").extract()[0]
            goodorbad = item.xpath(".//p[@class='mb5']/i/@class").extract()[0]
            comm_type = ''
            if u'good' in goodorbad:
                comm_type = "值得推荐"   # "worth recommending"
            if u'bad' in goodorbad:
                comm_type = "有待改善"   # "needs improvement"
            senti_value = self.hotelNLP.sentiment(remark.encode("utf-8"))
            viewpoint = json.dumps(self.hotelNLP.viewpoint(remark.encode("utf-8"), decoding="utf-8"))
            comm = {"guid": uuid.uuid1(),
                    "username": username,
                    "remark": remark,
                    "comm_time": comm_time,
                    "user_type": user_type,
                    "comm_type": comm_type,
                    "senti_value": senti_value,
                    "viewpoint": viewpoint,
                    "baseinfo_id": hotel_id}
            if self.__is_exist_in_comment_list(comm) is False:
                self.commList.append(comm)
            else:
                #print comm['remark']
                pass
        if pagenum == 1:
            break
        # click "next page"
        self.scroll_and_click_by_xpath("//div[@id='comment_paging']/a[@class='page_next']")
        pagenum -= 1
        time.sleep(random.uniform(1, 4))
        print pagenum
    return True
def __parseUrls(self, page_source): response = HtmlResponse(url="my HTML string",body=page_source,encoding="utf-8") # 抽取出每页中的酒店url存储到urlList中 urlList = response.xpath("//a[@class='name']/@href").extract() commnumList = response.xpath("//div[@class='comment']/a/span/text()").extract() name_list = response.xpath("//a[@class='name']/text()").extract() if len(urlList) == len(commnumList) == len(name_list): for i in range(0,len(urlList)): self.listPageInfo.append({ "guid":uuid.uuid1(), "url":urlList[i], "hotel_name":name_list[i], "OTA":"途牛", "comm_num":int(commnumList[i]), })
def parse_reviews(
    self, response: HtmlResponse, product_id: int
) -> Iterable[Item]:
    # response.xpath() returns a SelectorList, not a List[HtmlResponse]
    reviews: SelectorList = response.xpath(self.REVIEWS_LIST_XPATH)
    for review in reviews:
        rating: int = len(
            review.xpath(self.RATING_SELECTED_STARS_XPATH).getall()
        )
        time: str = review.xpath(self.TIMESTAMP_XPATH).get("")
        timestamp: float = (
            mktime(
                datetime.strptime(time, self.TIMESTAMP_FORMAT).timetuple()
            )
            if time
            else 0.0
        )
        text: str = review.xpath(self.TEXT_XPATH).get("")
        # the ": " default makes split(": ")[-1] yield "" when the field is absent
        size: str = review.xpath(self.SIZE_XPATH).get(": ").split(": ")[-1]
        color: str = (
            review.xpath(self.COLOR_XPATH).get(": ").split(": ")[-1]
        )
        yield ReviewItem(
            product_id=product_id,
            rating=rating,
            timestamp=timestamp,
            text=text,
            size=size,
            color=color,
        )
def parse(self, response): print "test point" response = HtmlResponse(url=response.url, status=response.status, headers=response.headers, body=response.body) url = response.url #first_active = response.xpath('//*[@id="zh-profile-activity-page-list"]/div/div[1]/a').extract() # active_page_list = response.xpath('//*[@id="zh-list-answer-wrap"]/div/h2/a/text()').extract() active_page_list = response.xpath('//*[@id="zh-list-answer-wrap"]/div') file_obj = open('collection_now.log', 'w') for active_block in active_page_list: #active = active_block.xpath('.//div[1]/text()').extract()[1].strip() #question = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/text()').extract() #answer_link = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/@href').extract()[0] #if 'http' not in answer_link: # answer_link = "http://www.zhihu.com" + answer_link question = active_block.xpath('.//h2/a/text()').extract()[0] # answer_link = active_block.xpath('.//div/div[1]/div[4]/div/a[@href="toggle-expand"]/@href').extract() answer_link = active_block.xpath( './/div/div[1]/div[4]/div/a/@href').extract() if len(answer_link) > 0: if 'http' not in answer_link[0]: answer_link_str = "http://www.zhihu.com" + answer_link[0] # print question, answer_link_str file_obj.write( question.encode('utf-8') + '\t' + answer_link_str.encode('utf-8') + '\n') # file_obj.write('\n') file_obj.close()
def vacansy_parce(self, response: HtmlResponse):
    link = response.url
    name = response.xpath("//h1/text()").extract_first()
    salary = response.xpath(
        "//span[@class='_3mfro _2Wp8I ZON4b PlM3e _2JVkc']/text()"
    ).extract()
    company_name = response.xpath(
        "//h2[@class='_3mfro PlM3e _2JVkc _2VHxz _3LJqf _15msI']/text()"
    ).extract()
    company_address = response.xpath(
        "//span[@class='_3mfro _1hP6a _2JVkc']/text()").extract_first()
    yield JobparserItem(name=name, salary=salary, company_name=company_name,
                        company_address=company_address, link=link)
def parse(self, response: HtmlResponse):
    # "Далее" is the "next page" link text
    next_page = response.xpath("//a[text()='Далее']/@href").extract_first()
    if next_page:
        yield response.follow(next_page, callback=self.parse)
    book_links = response.css('a.book__image-link::attr(href)').extract()
    for link in book_links:
        yield response.follow(link, callback=self.book_parse)
def parse_vacancy(self, response: HtmlResponse):
    item = {
        'script': response.xpath(
            '//script[@type="application/ld+json"]/text()').extract()
    }
    yield SuperjobItem(**item)
def parse(self, response): print "test point" response = HtmlResponse(url=response.url, status=response.status, headers=response.headers, body=response.body) url = response.url #first_active = response.xpath('//*[@id="zh-profile-activity-page-list"]/div/div[1]/a').extract() # active_page_list = response.xpath('//*[@id="zh-list-answer-wrap"]/div/h2/a/text()').extract() active_page_list = response.xpath('//*[@id="zh-list-answer-wrap"]/div') file_obj = open('collection_now.log', 'w') for active_block in active_page_list: #active = active_block.xpath('.//div[1]/text()').extract()[1].strip() #question = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/text()').extract() #answer_link = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/@href').extract()[0] #if 'http' not in answer_link: # answer_link = "http://www.zhihu.com" + answer_link question = active_block.xpath('.//h2/a/text()').extract()[0] # answer_link = active_block.xpath('.//div/div[1]/div[4]/div/a[@href="toggle-expand"]/@href').extract() answer_link = active_block.xpath('.//div/div[1]/div[4]/div/a/@href').extract() if len(answer_link) > 0: if 'http' not in answer_link[0]: answer_link_str = "http://www.zhihu.com" + answer_link[0] # print question, answer_link_str file_obj.write(question.encode('utf-8') + '\t' + answer_link_str.encode('utf-8') + '\n') # file_obj.write('\n') file_obj.close()
def start(self):
    self.driver.get(
        'https://wipo.taleo.net/careersection/wp_2/jobsearch.ftl?lang=en#')
    self.driver.maximize_window()
    self.driver.implicitly_wait(30)
    time.sleep(3)
    if 'Job' in self.driver.page_source:
        response = HtmlResponse(url="my HTML string",
                                body=self.driver.page_source,
                                encoding="utf-8")
        links = response.xpath(
            '//div[@class="multiline-data-container"]/div/span/a/@href'
        ).extract()
        logger.info("WIPO: %d job pages to crawl" % len(links))
        items = []
        for link in links:
            logger.debug("WIPO job page to crawl: https://wipo.taleo.net" + link)
            url = 'https://wipo.taleo.net' + link
            self.driver.get(url)
            time.sleep(3)
            item = self._parse(self.driver.page_source, url)
            if item not in items:
                logger.debug("page %s crawled successfully" % url)
                items.append(item)
        logger.debug("crawled %d WIPO job records in total" % len(items))
        saveToCsv = SaveToCsv()
        saveToCsv.saveWIPOjobs(WIPOPath, items)
    else:
        # page not ready yet; retry from the top
        self.start()
def vacansy_parse(self, response: HtmlResponse):
    name_vac = response.css('h1::text').extract_first()
    salary_vac = response.xpath(
        "//span[@class='_3mfro _2Wp8I PlM3e _2JVkc']/text()").extract()
    url_vac = response.url
    source_vac = 'superjob.ru'
    yield JobparserItem(name=name_vac, salary=salary_vac,
                        url=url_vac, source=source_vac)
def parse(self, response: HtmlResponse):
    last_page = response.xpath(
        "//div[contains(@data-marker,'pagination-button')]/span[contains(@data-marker,"
        "'page')][last()]/text()").extract_first()
    items = response.xpath(
        "//div[@itemtype='http://schema.org/Product'] //a[@data-marker='item-title']/@href"
    ).extract()
    if int(last_page) > 1:
        count_page = int(last_page)
        while count_page > 1:
            page_url = f'https://www.avito.ru/chelyabinsk/tovary_dlya_kompyutera?cd=2&p={count_page}'
            count_page -= 1
            yield response.follow(page_url, callback=self.parse)
    for item in items:
        item_link = f'https://www.avito.ru/{item}'
        yield response.follow(item_link, callback=self.item_pars)
def parse_2(self, response):
    cadena_temp_1 = response.body.split("<TABLE CELLSPACING=1>")
    cadena_temp_1 = cadena_temp_1[1].split("</TABLE>")
    cadena_temp_1[0] = ('<HTML><BODY><TABLE CELLSPACING=1>' + cadena_temp_1[0] +
                        '</TABLE></BODY></HTML>').lower()
    response = HtmlResponse(url=response.url, body=cadena_temp_1[0])
    #pprint.pprint("++++++++++++++++++++++++++++++")
    # column -> xpath mapping for the report table
    fields = [("date_text", 'td[1]/font/a/text()'),
              ("date_href", 'td[1]/font/a/@href'),
              ("city", 'td[2]/font/text()'),
              ("state", 'td[3]/font/text()'),
              ("shape", 'td[4]/font/text()'),
              ("duration", 'td[5]/font/text()'),
              ("summary", 'td[6]/font/text()'),
              ("posted", 'td[7]/font/text()')]
    for registro in response.xpath('.//body/table/tbody/tr'):
        item = Crawler_2Item()
        for key, xp in fields:
            values = registro.xpath(xp).extract()
            item[key] = values[0] if values else ""
        #pprint.pprint(item_tabla_2)
        url_nuevo = 'http://nuforc.org/webreports/' + item["date_href"]
        item["detalle1"] = ""
        item["detalle2"] = ""
        yield scrapy.Request(
            url_nuevo, body="", method='GET',
            headers={"content-type": "application/x-www-form-urlencoded"},
            dont_filter=True,
            # bind item now; a bare "lambda r: ..." would capture only the
            # last item of the loop (late binding)
            callback=lambda r, item=item: self.parse_3(r, item))
def __parseUrls(self, page_source): response = HtmlResponse(url="My HTML String", body=page_source, encoding="utf-8") hotel_list = response.xpath( "//div[@class='h_list']/div[@class='h_item']") for hotel in hotel_list: url = hotel.xpath(".//p[@class='h_info_b1']/a/@href").extract()[0] name = hotel.xpath( ".//p[@class='h_info_b1']/a/@title").extract()[0] address = hotel.xpath( ".//p[@class='h_info_b2']/text()").extract()[1] commnum = hotel.xpath( ".//div[@class='h_info_comt']/a/span[@class='c555 block mt5']/b/text()" ).extract() if len(commnum) == 0: commnum = 0 else: commnum = commnum[0] self.listPageInfo.append({ "guid": uuid.uuid1(), "url": url, "hotel_name": name, "OTA": self.__ota_info, "comm_num": commnum, "address": address }) pass
def parse(self, response):
    #pprint.pprint("------------------------------")
    cadena_temp_1 = response.body.split("<TABLE CELLSPACING=1>")
    cadena_temp_1 = cadena_temp_1[1].split("</TABLE>")
    cadena_temp_1[0] = ('<HTML><BODY><TABLE CELLSPACING=1>' + cadena_temp_1[0] +
                        '</TABLE></BODY></HTML>').lower()
    response_2 = HtmlResponse(
        url="http://nuforc.org/webreports/ndxevent.html",
        body=cadena_temp_1[0])
    for registro in response_2.xpath('.//body/table/tbody/tr'):
        item_tabla = CrawlerUfoItem()
        item_tabla['report_href'] = registro.xpath('td[1]/font/a/@href').extract()[0]
        item_tabla['report_text'] = registro.xpath('td[1]/font/a/text()').extract()[0]
        item_tabla['count'] = registro.xpath('td[2]/font/text()').extract()[0]
        #pprint.pprint(item_tabla)
        url_nuevo = 'http://nuforc.org/webreports/' + item_tabla['report_href']
        yield scrapy.Request(
            url_nuevo, body="", method='GET',
            headers={"content-type": "application/x-www-form-urlencoded"},
            callback=self.parse_2, dont_filter=True)
def advert_parse(self, response: HtmlResponse):
    title = response.xpath(
        '//span[contains(@class, "title-info-title-text")]/text()'
    ).extract_first()
    price = response.xpath(
        '//span[contains(@class, "js-item-price")]/text()').extract_first()
    attrs = response.xpath(
        '//li[contains(@class, "item-params-list-item")]').extract()
    clean_attrs = []
    for attr in attrs:
        clean_attrs.append(self.remove_html_tags(attr))
    yield AvitoparseItem(title=title, price=price, attrs=clean_attrs)
def __crawllianjie(self, page_sourse):
    response = HtmlResponse(url="my HTML string", body=page_sourse, encoding="utf-8")
    hotel_list = response.xpath("//div[@class='searchresult_list ']/ul")
    for hotel in hotel_list:
        url = hotel.xpath(
            "li[@class='searchresult_info_name']/h2/a/@href").extract()[0]
        address = hotel.xpath(
            "li[@class='searchresult_info_name']/p[@class='searchresult_htladdress']/text()"
        ).extract()[0]
        commnum = hotel.xpath(
            "li[@class='searchresult_info_judge ']/div/a/span[@class='hotel_judgement']/text()"
        ).extract()
        if len(commnum):
            commnum = re.sub('\D', '', commnum[0])
            commnum = commnum if len(commnum) > 0 else 0
        else:
            commnum = 0
        name = hotel.xpath(
            "li[@class='searchresult_info_name']/h2/a/text()").extract()[0]
        self.listPageInfo.append({
            "guid": uuid.uuid1(),
            "url": url,
            "hotel_name": name,
            "OTA": self.__ota_info,
            "comm_num": int(commnum),
            "address": address
        })
def parse(self, response):
    f = open('glassdoor_northwest.csv', 'a')
    writer = csv.writer(f)
    text = response.xpath('//div[@class="hreview"]').extract()

    def clean(value):
        # collapse the scraped list into a printable CSV cell
        # (same replace chain the original applied inline to every column)
        return (str(value).replace("\'", "").replace('\\t', '').replace('\\r', "")
                .replace('\\n', ' ').replace('\\', '').lstrip('[').rstrip(']')
                .replace("', '", '').replace('\xa0', ''))

    for items in text:
        items = HtmlResponse(url="my html string", body=items, encoding='utf-8')
        date = items.xpath('//time[@class="date subtle small"]/text()').extract()
        author = items.xpath('//span[@class = "authorJobTitle reviewer"]/text()').extract()
        location = items.xpath('//span[@class = "authorLocation"]/text()').extract()
        work_exp = items.xpath('//p[@class = " tightBot mainText"]/text()').extract()
        pros = items.xpath('//p[@class = " pros mainText truncateThis wrapToggleStr"]/text()').extract()
        cons = items.xpath('//p[@class = " cons mainText truncateThis wrapToggleStr"]/text()').extract()
        string = str(pros) + " " + str(cons)
        #user.append([date, author, location, work_exp, pros, cons])
        writer.writerow([clean(date), clean(author), clean(location),
                         clean(work_exp), clean(string)])
    f.close()  # flush the appended rows (the original never closed the file)
def spider():
    options = Options()
    options.add_argument('-headless')
    driver = webdriver.Chrome(options=options)
    # the three lines above run Chrome in headless mode, which speeds things up;
    # while testing you can comment them out and launch a visible browser instead:
    # driver = webdriver.Chrome()
    url = 'https://www.shiyanlou.com/courses/427'
    driver.get(url)  # open the page to be crawled
    result = []
    while True:
        driver.implicitly_wait(3)  # implicit wait of 3 seconds
        html = driver.page_source
        response = HtmlResponse(url=url, body=html.encode())
        for comment in response.css('div.comment-item'):
            d = {
                'username': comment.css('a.name::text').extract_first().strip(),
                'content': comment.css('div.content::text').extract_first().strip()
            }
            result.append(d)
        # if the class of the second li tag contains "disabled", there is no next page
        if 'disabled' in response.xpath('(//li[contains'
                                        '(@class, "page-item")])[2]/@class').extract_first():
            break
        # locate the second li tag, i.e. the "next page" button
        ac = driver.find_element_by_xpath('(//li[contains(@class, "page-item")])[2]')
        # chromedriver cannot interact with elements outside the visible area,
        # so scroll the button into view first
        ActionChains(driver).move_to_element(ac).perform()
        time.sleep(1)  # wait for the button to load
        ac.click()  # click the next-page button
    driver.quit()
    with open('comments.json', 'w') as f:
        json.dump(result, f)
def vacancy_parse(self, response: HtmlResponse):
    name = response.css('div.vacancy-title h1::text').extract_first()
    salary = response.xpath(
        "//p[@class='vacancy-salary']/span/text()").extract()
    link = response.url
    # print(name, salary)
    yield JobparserItem(name=name, salary=salary, link=link)
def book_parse(self, response: HtmlResponse):
    name = response.css("h1::text").extract_first()
    # "Автор" = "Author"
    author = response.xpath(
        "//div[@class='item-tab__chars-item']//span[contains(text(),'Автор')]//..//span//a//text()"
    ).extract()
    main_price = response.css(
        "div.item-actions__price-old::text").extract_first()
    discount_price = response.xpath(
        "//div[@class='item-actions__price']//b//text()").extract_first()
    rating = 0  # response.xpath("//div[@id='rate']//text()").extract_first()
    yield JobparserItem(name=name, href=response.url, author=author,
                        main_price=main_price, discount_price=discount_price,
                        rating=rating)
def set_secure_headers(self, html_url):
    """Set the authorization and x-guest-token request headers."""
    home_page_content = requests.get(url=html_url, headers=self.common_headers).text
    x_guest_token = re.search('decodeURIComponent\("gt=(.*?);',
                              home_page_content, re.S).group(1)
    # 2. locate the JS file that carries the authorization token
    home_page_response = HtmlResponse(url=html_url, body=home_page_content,
                                      encoding='utf-8')
    token_js_url = home_page_response.xpath(
        '//link[@rel="preload"][last()]/@href').extract_first()
    js_content = requests.get(url=token_js_url, headers=self.common_headers).text
    authorization_code = re.search('a="Web-12",s="(.*?)"', js_content, re.S).group(1)
    headers = {
        'Connection': 'keep-alive',
        'authorization': 'Bearer {}'.format(authorization_code),
        'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 '
                      '(KHTML, like Gecko) Chrome/80.0.3987.116 Safari/537.36',
        'x-guest-token': x_guest_token,
        'Accept': '*/*',
        'Accept-Language': 'zh-CN,zh;q=0.9'
    }
    self.request_headers.update(headers)
def vacansy_parse(self, response: HtmlResponse):
    name_vac = response.css('h1::text').extract_first()
    salary_vac = response.xpath(
        "//span[@class='bloko-header-2 bloko-header-2_lite']/text()"
    ).extract()
    yield JobparserItem(name=name_vac, salary=salary_vac)
def parse(self, response: HtmlResponse):
    ads_links = response.xpath(
        '//a[@class="item-description-title-link"]/@href | '
        '//a[@class="description-title-link js-item-link"]/@href').extract()
    for link in ads_links:
        yield response.follow(link, self.parse_ads)
def vacancy_parse(self, response: HtmlResponse):
    link = response.url
    name = response.xpath("//h1[@class='header']//span/text()").extract_first()
    salary = response.css('div.vacancy-title p.vacancy-salary::text').extract()
    salary = self.format_salary(salary)
    print(link, name, salary)  # debug output
    yield JobparserItem(name=name, salary=salary, link=link, site="hh.ru")
def vacansy_parce(self, response: HtmlResponse):
    link = response.url
    name = response.xpath("//h1/text()").extract_first()
    salary = response.xpath(
        "//p[@class='vacancy-salary']/span/text()").extract()
    company_name = response.xpath(
        "//a[@data-qa='vacancy-company-name']/span/text() | "
        "//a[@data-qa='vacancy-company-name']/span/span/text()"
    ).extract()
    company_address = response.xpath(
        "//p[@data-qa='vacancy-view-location']/text() | "
        "//p[@data-qa='vacancy-view-location']/span/text()"
    ).extract()
    yield JobparserItem(name=name, salary=salary, company_name=company_name,
                        company_address=company_address, link=link)
def parse(self, response): print "test point" response = HtmlResponse(url=response.url, status=response.status, headers=response.headers, body=response.body) url = response.url #first_active = response.xpath('//*[@id="zh-profile-activity-page-list"]/div/div[1]/a').extract() #active_page_list = response.xpath('//*[@id="zh-profile-activity-page-list"]/div/div[1]/a[@class="question_link" or @class="post-link"]/text()').extract() active_page_list = response.xpath('//*[@id="zh-profile-activity-page-list"]/div') file_obj = open('active_now.log', 'w') for active_block in active_page_list: active = active_block.xpath('.//div[1]/text()').extract()[1].strip() question = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/text()').extract() answer_link_list = active_block.xpath('.//div[1]/a[@class="question_link" or @class="post-link"]/@href').extract() answer_link = "" if len(answer_link_list) > 0: answer_link = answer_link_list[0] question_txt = "" if len(question) > 0: question_txt = question[0] if 'http' not in answer_link: answer_link = "http://www.zhihu.com" + answer_link file_obj.write(active.encode('utf-8') + '\t' + question_txt.encode('utf-8') + '\t' + answer_link.encode('utf-8') + '\n') # file_obj.write('\n') print answer_link file_obj.close()
def parse_news(self, response):
    self.logger.info('parse_news: %s' % response)
    parsed_news = json.loads(str(response.body))[0]

    # Initialize item loader
    # extract news title, published_at, author, content, url
    loader = ItemLoader(item=News(), response=response)
    loader.add_value('url', parsed_news['url'])

    if not parsed_news['title']:
        # Will be dropped on the item pipeline
        return loader.load_item()
    loader.add_value('title', parsed_news['title'])

    # Convert HTML text to a scrapy response
    html_response = HtmlResponse(url=parsed_news['url'],
                                 body=parsed_news['content'].encode('utf-8', 'ignore'))
    xpath_query = '''
        //body/node()
            [not(descendant-or-self::comment()|
                descendant-or-self::style|
                descendant-or-self::script|
                descendant-or-self::div|
                descendant-or-self::span|
                descendant-or-self::image|
                descendant-or-self::img|
                descendant-or-self::iframe
            )]
    '''
    raw_content_selectors = html_response.xpath(xpath_query)
    if not raw_content_selectors:
        # Will be dropped on the item pipeline
        return loader.load_item()
    raw_content = raw_content_selectors.extract()
    raw_content = ' '.join([w.strip() for w in raw_content])
    loader.add_value('raw_content', raw_content)

    if not parsed_news['published']:
        # Will be dropped on the item pipeline
        return loader.load_item()

    # Parse date information
    # Example: 12 Oct 2016 - 05:25
    date_time_str = ' '.join([_(w) for w in
                              parsed_news['published'].split(',')[1].strip()[:-4].split(' ')])
    try:
        published_at_wib = datetime.strptime(date_time_str, '%d %b %Y - %H:%M')
    except ValueError:
        # Will be dropped on the item pipeline
        return loader.load_item()
    published_at = wib_to_utc(published_at_wib)
    loader.add_value('published_at', published_at)

    if not parsed_news['author']:
        loader.add_value('author_name', '')
    else:
        loader.add_value('author_name', parsed_news['author'])

    # Move scraped news to pipeline
    return loader.load_item()
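# A minimal standalone check of the descendant-excluding XPath pattern used in
# parse_news above (toy HTML and an abbreviated three-tag exclusion list, both
# assumptions for illustration): container elements are dropped while bare text
# and simple inline nodes under <body> survive.
from scrapy.http import HtmlResponse

toy = HtmlResponse(
    url="about:blank",
    body=b"<html><body>plain text<div>boxed</div><p>kept</p></body></html>")
kept = toy.xpath(
    "//body/node()[not(descendant-or-self::div"
    "|descendant-or-self::script"
    "|descendant-or-self::style)]").extract()
print(kept)  # ['plain text', '<p>kept</p>']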
def spider():
    body = open('test.html').read()
    response = HtmlResponse(url='http://example.com', body=body.encode('utf-8'))
    results = []
    for company in response.xpath('//div[@class="media"]'):
        middle = company.xpath('.//li[@class="text-muted"]/span/text()').extract()[1:]
        result = dict(
            title=company.xpath('.//h4[@class="media-heading"]/a/text()').extract_first(),
            site=company.xpath('.//a[@class="company-site"]/@href').extract_first(),
            logo=company.xpath('.//div[@class="img-warp"]/a[@class="company-logo"]/@style'
                               ).re_first(r"background-image: url\('(.+)'\)"),
            desc=company.xpath('.//p[@class="company-desc"]/text()').extract_first(),
            location=company.xpath('.//li[@class="text-muted "]/span/text()').extract_first(),
            field=middle[0] if middle else ''
        )
        results.append(result)
    with open('../datas/companies.json', 'w') as f:
        f.write(json.dumps(results))
def parse(self, response):
    items = json.loads(response.body.decode('utf-8'))['items']
    pub_dt = None
    for i in items:
        resp = HtmlResponse(url='', body=i['html'], encoding='utf8')
        link = resp.xpath('//a/@href').extract()[0]
        pub_dt = datetime.fromtimestamp(i['publish_date_t'])
        if pub_dt.date() >= self.until_date:
            yield scrapy.Request(url=link, callback=self.parse_document,
                                 meta={"pub_dt": pub_dt})
    # Requesting the next page if the publication date of the last article
    # is still above "until_date"
    if pub_dt and pub_dt.date() >= self.until_date:
        # Forming the next page link
        link_url = self.link_tmpl.format(int(pub_dt.timestamp()))
        yield scrapy.Request(url=link_url, priority=100, callback=self.parse,
                             meta={'page_depth': response.meta.get('page_depth', 1) + 1})
def parse(self, response: HtmlResponse): """ http:// ajx : print(response.body.decode('utf-8')) 获取源代码 """ fans_list = response.xpath('//ul[@class="user-list"]//li') for fans in fans_list: item = {} # 粉丝名字 item['fans_name'] = fans.xpath( './div[@class="info"]/a/text()').extract_first('') # 粉丝数量 item['fans_sum'] = fans.xpath( './div[@class="info"]/div[1]/span[1]/text()').extract_first('') # 链接 fans_href = fans.xpath( './div[@class="info"]/a/@href').extract_first('') fans_href = fans_href.split('/')[-1] fans_href = 'https://www.jianshu.com/users/' + fans_href + '/followers?page={}' count = int( fans.xpath('./div[@class="info"]/div[1]/span[2]/text()').re( '粉丝 (.*)')[0]) for i in range(1, count // 9 + 1): yield scrapy.Request(url=fans_href.format(i), callback=self.parse) yield item
def vacancy_parse(self, response: HtmlResponse):
    name_job = response.xpath('//h1/text()').extract_first()
    salary_job = response.xpath(
        '//span[@class="_1OuF_ ZON4b"]//text()').extract()
    location_job = response.xpath(
        '//div[@class="f-test-address _3AQrx"]//text()').extract()
    position_link = response.url
    company_job = response.xpath(
        '//span[@class="_3mfro _1hP6a _2JVkc _2VHxz"]/text() |'
        ' //h2[@class="_3mfro PlM3e _2JVkc _2VHxz _3LJqf _15msI"]/text()'
    ).extract_first()
    yield JobparserItem(name=name_job, salary=salary_job,
                        location=location_job, link=position_link,
                        company=company_job)
def parse(self, response: HtmlResponse):
    if response.url == 'https://www.kommersant.ru/rubric/3':
        rubric = 'economics'
    else:
        rubric = 'finance'
    topics = response.xpath(
        '//div[@class="grid_cell grid_cell_big js-middle"]//h4[contains(@class, "uho")]/a//text()').extract()
    resumes = response.xpath(
        '//div[@class="grid_cell grid_cell_big js-middle"]//h3[contains(@class, "uho")]/a//text()').extract()
    hrefs = response.xpath(
        '//div[@class="grid_cell grid_cell_big js-middle"]//h3[contains(@class, "uho")]/a/@href').extract()
    for i, href in enumerate(hrefs):
        variable = {'rubric': rubric, 'topic': topics[i],
                    'resume': resumes[i], 'href': href}
        yield response.follow(href, callback=self.get_item,
                              meta={'attrs': deepcopy(variable)})
def parse(self, response: HtmlResponse):
    # '/html/body/div[1]/div[2]/div[2]/div[3]/div[4]/div/div[1]/div[2]/div/div[2]/div[1]/div[1]/div[1]/h3/a'
    # urls = response.xpath('//div[contains(@data-marker, "item")]/div[@class="item__line"]//h3/a[@itemprop="url"]')
    for url in response.xpath(
            '//div[contains(@data-marker, "item")]/div[@class="item__line"]//h3/a[@itemprop="url"]'):
        yield response.follow(url, callback=self.avd_parse)
def parse(self, response):
    response = HtmlResponse(url=self.shops_root_url, body=response.body)
    all_links = response.xpath('*//a/@href').extract()
    link_index = 0
    for link in all_links:
        link_index = link_index + 1
        yield SplashRequest(url=self.shops_root_url,
                            callback=self.parse_via_pages,
                            endpoint='execute',
                            args={'lua_source': script, 'link_index': link_index})
def parse_page(self, response):
    #print "test point"
    response = HtmlResponse(url=response.url, status=response.status,
                            headers=response.headers, body=response.body)
    url = response.url
    name = response.xpath('//h2[@class="zm-item-title zm-editable-content"]/text()').extract()
    context_list = response.xpath('//div[@class="zm-editable-content"]/text()').extract()
    print name[0]
    for context in context_list:
        print context
    answer_num = response.xpath('//h3/@data-num').extract()
    if len(answer_num) == 0:
        print 1
    else:
        print answer_num[0]
    author_list = response.xpath('//*[@class="author-link"]/text()').extract()
    for author in author_list:
        print author
def parse(self, response):
    response = HtmlResponse(url=response.url, status=response.status,
                            headers=response.headers, body=response.body)
    url = response.url
    title = " ".join(response.xpath('/html/body/div[@class="container"]/div[@class="bodybox"]/div[1]/div[2]/h1/text()').extract())
    header = response.xpath('/html/body/div[@class="container"]/div[@class="bodybox"]/div[1]/div[2]/h2/text()').extract()
    if len(header) <= 1:
        return
    paragraph_list = response.xpath('/html/body/div[@class="container"]/div[@class="bodybox"]/div[1]/div[2]/div[2]/p/text()').extract()
    body = '\n'.join(paragraph_list)
    time_source = header[0]
    related_industry = ""
    related_theme = ""
    ri_key = "关联行业"   # "related industry"
    rt_key = "关联概念"   # "related theme/concept"
    for item in header:
        print item
        if ri_key.decode('utf-8') in item:
            related_industry = item.split(u':')[1]
            #continue
        if rt_key.decode('utf-8') in item:
            related_theme = item.split(u':')[1]
    if related_theme == "":
        return
    file_name = './' + url.split('_')[1] + '.ycj'
    #print url, title, related_theme, body
    news = News()
    news['title'] = title
    news['url'] = url
    news['time_source'] = time_source
    news['related_industry'] = related_industry
    news['related_theme'] = related_theme
    news['body'] = body
    return news
def __parseHotelRoomInfo(self, page_source, hotel_id): response = HtmlResponse(url="My HTML String", body=page_source, encoding="utf-8") hotel_price_list_len = len(response.xpath("//div[@class='hotel_price_body']/div")) hotel_price_body_dom = response.xpath("//div[@class='hotel_price_body']") crawl_time = datetime.datetime.now().strftime('%Y-%m-%d') if hotel_price_list_len < 2: return False else: for i in range(2, hotel_price_list_len+1): room_item_list_len = len(hotel_price_body_dom.xpath("div[%d]/div[@class='fleft s2']/div[@class='item']"%i)) room_item_list = hotel_price_body_dom.xpath("div[%d]/div[@class='fleft s2']"%i) room_name = hotel_price_body_dom.xpath("div[%d]/div[@class='fleft s1']/div/p[@class='name']/text()"%i).extract()[0] if room_item_list_len > 0: for j in range(1, room_item_list_len+1): description = room_item_list.xpath("div[%d]/div[@class='m1 fleft']/span/text()"%(j+1)).extract()[0] bed_type = room_item_list.xpath("div[%d]/div[@class='m2 fleft']/span/text()"%(j+1)).extract()[0] breakfast = room_item_list.xpath("div[%d]/div[@class='m3 fleft']/span/text()"%(j+1)).extract()[0] wifi = room_item_list.xpath("div[%d]/div[@class='m4 fleft']/span/a/text()"%(j+1)).extract()[0] cancel_policy = room_item_list.xpath("div[%d]/div[@class='m5 fleft']/span/a/text()"%(j+1)).extract()[0] price = room_item_list.xpath("div[%d]/div[@class='m6 fleft']//span[@class='digit']/text()"%(j+1)).extract()[0] self.priceList.append({"guid":uuid.uuid1(),"room_name":room_name, "description":description, "bed_type":bed_type, "breakfast":breakfast, "wifi":wifi, "cancel_policy":cancel_policy, "price": int(price),"crawl_time":crawl_time, "hotel_id":hotel_id})
def __parseHotelComment(self, page_source, hotel_id, comm_type):
    response = HtmlResponse(url="My HTML String", body=page_source, encoding="utf-8")
    remarkDom = response.xpath("//div[@class='user_remark_datail']")
    remarkDomLen = len(response.xpath("//div[@class='user_remark_datail']/div"))
    # number of comments on this page that are already in the saved list
    same_num = 0
    for i in range(1, remarkDomLen + 1):
        id = uuid.uuid1()
        # username
        username = remarkDom.xpath("div[%d]/div[@class='a1']/div[@class='b2']/text()" % i).extract()
        username = username[0] if len(username) > 0 else ""
        # comment text (loop variable renamed so it no longer shadows built-in str)
        remarkText = remarkDom.xpath("div[%d]/div[@class='a2']/div[@class='b2']/p/text()" % i).extract()
        remark = ""
        for fragment in remarkText:
            remark = remark + re.sub("\s+", "", fragment)
        # comment time
        comm_time = remarkDom.xpath("div[%d]/div[@class='a2']/div[@class='b4']/div[@style='float: right;']/text()" % i).extract()[0]
        # user type
        user_type = ""
        senti_value = None
        viewpoint = None
        try:
            user_type = remarkDom.xpath("div[%d]/div[@class='a1']/div[@class='b3']/text()" % i).extract()[0]
            senti_value = self.hotelNLP.sentiment(remark.encode("utf-8"))
            viewpoint = json.dumps(self.hotelNLP.viewpoint(remark.encode("utf-8"), decoding="utf-8"))
        except:
            traceback.print_exc()
        comm = {"guid": id, "username": username, "remark": remark,
                "comm_time": comm_time, "user_type": user_type,
                "hotel_id": hotel_id, "comm_type": comm_type,
                "senti_value": senti_value, "viewpoint": viewpoint}
        if self.__is_exist_in_comment_list(comm):
            same_num += 1
        else:
            self.commList.append(comm)
    if same_num == remarkDomLen:
        return False
    else:
        return True
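# __is_exist_in_comment_list is called above (and in __crawlHotelComment earlier)
# but its body is not part of this collection. A minimal sketch of what it
# plausibly does, assuming a comment counts as already saved when the same user,
# text and timestamp appear in self.commList -- an assumption, not the original
# implementation:
def __is_exist_in_comment_list(self, comm):
    for saved in self.commList:
        if (saved["username"] == comm["username"] and
                saved["remark"] == comm["remark"] and
                saved["comm_time"] == comm["comm_time"]):
            return True
    return False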
def parse(self, response):
    data = json.loads(response.body_as_unicode())
    stores = data['markers']
    for store in stores:
        html = HtmlResponse(
            url="",
            body=store['info'].encode('UTF-8')
        )
        unp = {}
        unp['lat'] = store['lat']
        unp['lon'] = store['lng']
        if unp['lat']:
            unp['lat'] = float(unp['lat'])
        if unp['lon']:
            unp['lon'] = float(unp['lon'])
        unp['ref'] = store['locationId']
        unp['addr_full'] = html.xpath('//div[contains(@class, "addr")]/text()').extract_first()
        unp['phone'] = html.xpath('//div[contains(@class, "phone")]/text()').extract_first()
        unp['name'] = html.xpath('//div[@class="loc-name"]/text()').extract_first()

        addr2 = html.xpath('//div[contains(@class, "csz")]/text()').extract_first()
        if addr2:
            addr2 = addr2.strip()
            three_pieces = self.addr2regex.search(addr2)
            if three_pieces:
                city, state, zipcode = three_pieces.groups()
                unp['city'] = city
                unp['state'] = state
                unp['postcode'] = zipcode

        properties = {}
        for key in unp:
            if unp[key]:
                properties[key] = unp[key]
        yield GeojsonPointItem(**properties)
def parse_3(self, response, item):
    cadena_temp_2 = response.body.lower().replace("<p>", "")
    response = HtmlResponse(url=response.url, body=cadena_temp_2)
    #pprint.pprint(response.xpath('.//body/table/font/caption/b/text()').extract()[0])
    #pprint.pprint("******************************")
    pprint.pprint(item)
    for registro in response.xpath('.//body/table/tbody'):
        if not registro.xpath('tr[1]/td/font').extract():
            item["detalle1"] = ""
        else:
            item["detalle1"] = registro.xpath('tr[1]/td/font').extract()[0]
        if not registro.xpath('tr[2]/td/font').extract():
            item["detalle2"] = ""
        else:
            item["detalle2"] = registro.xpath('tr[2]/td/font').extract()[0]
        #pprint.pprint(item_tabla_3)
        yield item
def parse(self, response):
    data = json.loads(response.body_as_unicode())
    stores = data['Results']
    for store in stores:
        url = 'https://www.concentra.com{}'.format(store['Url'])
        lat, lon = None, None
        if 'Geospatial' in store:
            geospatial = store['Geospatial']
            if 'Latitude' in geospatial:
                lat = geospatial['Latitude']
            if 'Longitude' in geospatial:
                lon = geospatial['Longitude']

        # Most of the data is stored as an html blob inside the json,
        # so build a new HtmlResponse from it which we can parse.
        html = HtmlResponse(
            url=url,
            body=store['Html'].encode('utf-8')
        )
        addr1 = html.xpath('//div[@class="field-addressline1"]/text()').extract_first()
        addr2 = html.xpath('//div[@class="field-addressline2"]/text()').extract_first()
        postcode = html.xpath('//span[@class="field-zipcode"]/text()').extract_first()
        phone = html.xpath('//div[@class="field-mainphone"]/text()').extract_first()
        state = html.xpath('//span[@class="field-stateabbreviation"]/text()').extract_first()
        city = html.xpath('//div[@class="field-centername"]/text()').extract_first()
        name = html.xpath('//div[@class="location-clinic-link"]/a/@title').extract_first()

        if addr1:
            addr1 = addr1.strip()
        if addr2:
            addr2 = addr2.strip()
        addr_full = None
        if addr1 and addr2:
            addr_full = ' '.join([addr1, addr2])
        elif addr1:
            addr_full = addr1

        properties = {}
        properties['ref'] = store['Id']
        properties['website'] = url
        if addr_full:
            properties['addr_full'] = addr_full
        if name:
            properties['name'] = name
        if city:
            properties['city'] = city
        if state:
            properties['state'] = state
        if postcode:
            properties['postcode'] = postcode
        if phone:
            properties['phone'] = phone.replace('.', '-')
        if lat:
            properties['lat'] = lat
        if lon:
            properties['lon'] = lon

        yield GeojsonPointItem(**properties)
def crawllianjie(self, page_sourse):
    response = HtmlResponse(url="my HTML string", body=page_sourse, encoding="utf-8")
    hotel_list = response.xpath("//div[@class='searchresult_list ']/ul")
    # collect the link of every hotel on the page
    for hotel in hotel_list:
        url = hotel.xpath("li[@class='searchresult_info_name']/h2/a/@href").extract()
        # comment count
        commnum = hotel.xpath(
            "li[@class='searchresult_info_judge ']/div/a/span[@class='hotel_judgement']/text()"
        ).extract()
        if len(commnum):
            Discuss = re.sub('\D', '', commnum[0])
            if not len(Discuss):
                Discuss = 0
        else:
            Discuss = 0
        self.listPageInfo.append({"url": url[0],
                                  "comm_num": Discuss,
                                  "city": "南京"})   # Nanjing
    xiechengService.saveListPageInfo()
    # a full list page carries 25 hotels; otherwise log the count and reset
    if len(self.listPageInfo) == 25:
        pass
    else:
        print len(self.listPageInfo)
        self.listPageInfo = []
        print e.code
        print e.read()
        continue
    except urllib2.URLError, e:
        print e.code
        print e.read()
        continue
    except httplib.HTTPException, e:
        print e.code()
        print e.read()
        continue

    post_soup = BeautifulSoup(post_page, "lxml")
    post_response = HtmlResponse(url=page_url.url, body=str(post_soup))

    path = old_path
    version = 0  # 0: old layout, 1: new layout
    reply_num = post_response.xpath(path['reply_num']).extract()
    if reply_num == []:
        version = 1
        #print "new version!"
        path = new_path
        reply_num = post_response.xpath(path['reply_num']).extract()
        if reply_num == []:
            reply_num = 0
        else:
            reply_num = int(reply_num[0].strip().split(' ')[0])
    else:
        reply_num = int(reply_num[0].strip().split(' ')[0])
        #print "old version"
    if reply_num == 0:
        continue
def crawlHotelInfo(self, target):
    # target comes from the baseinfo table
    url = target[1]
    self.openPage("http://hotel.elong.com" + url)
    self.wait(3)
    time.sleep(random.uniform(1, 3))
    # if the URL is no longer valid, return
    if self.isAlertPresent():
        return False
    response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
    # parse the hotel page info
    if self.if_crawl_hotel_info is True:
        self.__parseHotelInfo(response, target)
    # parse the hotel room info
    if self.if_crawl_hotel_price is True:
        record_time = 0
        while 1:
            try:
                self.priceList = []
                self.__parseHotelRoomInfo(self.driver.page_source, target[0])
                break
            except:
                time.sleep(2)
                record_time += 1
                if record_time > 3:
                    break
    # crawl the hotel reviews
    if self.if_crawl_hotel_comment is True:
        self.commList = []
        # TODO redundant check: only crawl when the comment count is above 0
        if target[4] > 0:
            self.driver.find_element_by_xpath("//body").send_keys(Keys.END)
            self.wait(2)
            read_time = 0
            comm_page_num = ''
            while 1:
                try:
                    response = HtmlResponse(url="My HTML String", body=self.driver.page_source, encoding="utf-8")
                    hotelname = ""
                    page_a = response.xpath("//div[@id='comment_paging']/a")
                    if len(page_a) == 1:
                        comm_page_num = page_a.xpath(".//text()").extract()[0]
                    if len(page_a) > 1:
                        comm_page_num = page_a.xpath(".//text()").extract()[-2]
                except Exception, e:
                    print e
                    time.sleep(1)
                read_time += 1
                if read_time > 10:  # give up after 10 waits
                    break
                if comm_page_num != '':
                    break
            print "comment pages in total:", comm_page_num
            if self.__crawlHotelComment(self.driver, target[0], comm_page_num):
                print ""
                print "scraped", len(self.commList), "comments into commList"
# coding=utf-8
import re
import json

import requests
from scrapy.http import HtmlResponse

response = HtmlResponse(
    url='http://weixin.sogou.com/gzhjs?cb=sogou.weixin.gzhcb&openid=oIWsFt_Id9NTbaO6ms2zvSBm2RzI&eqs=qBsQoCeguK%2B0ofdI%2B6h3FuvrCqfh1RlwTme4vOefG9aBeZd%2BPz%2FN4dn91sq5UJD2r2xev&ekv=3&page=1')
# response.selector.xpath('//span/text()').extract()
# response.xpath('//title/text()')
# Selector(response=response).xpath('//span/text()').extract()

content = requests.get(response.url).content
# doc = u"""
# <span id="J_realContact" data-real="电话021-60131333 传真021-60131356 <a target='_blank' href='http://my.ctrip.com/uxp/Community/CommunityAdvice.aspx?producttype=3&categoryid=65'>纠错</a>" style="color:#0066cc;cursor:pointer;">联系方式</span>
# """
# print content
# regex = re.compile(r"sogou\.weixin\.gzhcb\((.*\])\}\)")
# print regex.findall(content)

# strip the JSONP wrapper down to the bare JSON payload
content = re.search(r'\{.*\]\}', content).group()

docs = ""
for i in json.loads(content)["items"]:
    docs += i

se = HtmlResponse(url="http://www.qq.com", body=docs, encoding="utf8")
print se.xpath("//item//docid/text()").extract()
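# The re.search(r'\{.*\]\}', ...) call above strips the JSONP wrapper
# ("sogou.weixin.gzhcb({...})") down to the bare JSON object. A tiny
# self-contained illustration of the same unwrap on a made-up payload:
demo = 'sogou.weixin.gzhcb({"page": 1, "items": ["<doc/>"]})'
print json.loads(re.search(r'\{.*\]\}', demo).group())["items"]  # ['<doc/>']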
def __parseHotelRoomInfo(self, page_source, hotel_id): response = HtmlResponse(url="My HTML String", body=page_source, encoding="utf-8") rooms_list = response.xpath("//div[@class='htype_list']/div") rooms_list_len = len(rooms_list) if rooms_list_len<0: return False crawl_time = datetime.datetime.now().strftime('%Y-%m-%d') for rooms in rooms_list: #房间名称 roomname = rooms.xpath(".//p[@class='htype_info_name']/span/text()").extract()[0] #房间大小 roomarea = rooms.xpath(".//p[@class='htype_info_ty']/span[1]/text()").extract() if len(roomarea)!=0: roomarea = roomarea[0] else: roomarea = '' #床型 bedtype = rooms.xpath(".//p[@class='htype_info_ty']/span[3]/text()").extract() if len(bedtype)!=0: bedtype = bedtype[0] else: bedtype = '' #人数 havenum = rooms.xpath(".//p[@class='htype_info_ty']/span[3]/span/text()") if havenum: peoplecount = str(havenum.extract()[0]) else: peoplecount = str(len(rooms.xpath(".//p[@class='htype_info_ty']/span[5]/i"))) if peoplecount == '0': peoplecount = '未说明' #楼层 roomsfloor = rooms.xpath(".//p[@class='htype_info_ty']/span[7]/text()").extract() if len(roomsfloor)!=0: roomsfloor = roomsfloor[0] else: roomsfloor = '' havewifi = rooms.xpath(".//p[@class='htype_info_ty']/span[9]/text()").extract() if len(havewifi)!=0: havewifi = havewifi[0] else: havewifi = '' list = rooms.xpath(".//table[@class='htype-table']/tbody/tr[@data-handle='rp']") descriptions = rooms.xpath(".//td[@class='ht_other']//p/text()").extract() description = '' for d in descriptions: dstrip = d.strip() if u'查看更多产品报价' != dstrip: description +=dstrip for room in list: roomtype = room.xpath(".//td[@class='ht_name']/span/text()").extract()[0] supply = room.xpath(".//td[@class='ht_supply']/text()").extract()[0].strip() breakfast = room.xpath(".//td[@class='ht_brak']/text()").extract()[0] rule = room.xpath(".//td[@class='ht_rule']/span/text()").extract()[0] price = room.xpath(".//td[@class='ht_pri']/span[@class='ht_pri_h cur']/span/text()").extract()[0] self.priceList.append({ 'guid':uuid.uuid1(), 'room_name':roomname, 'room_area':roomarea, 'bed_type':bedtype, 'people_count':peoplecount, 'rooms_floor':roomsfloor, 'wifi':havewifi, 'description':description, 'room_type':roomtype, 'supply':supply, 'breakfast':breakfast, 'cancel_policy':rule, 'price':price, 'crawl_time':crawl_time, 'hotel_id':hotel_id }) pass
def getcommentinfo(self, page_sourse):
    response = HtmlResponse(url="my HTML string", body=self.driver.page_source, encoding="utf-8")
    commentData = response.xpath("//div[@class='comment_detail_list']/div[@class='comment_block J_asyncCmt']")
    title = response.xpath("//div[@class='main_detail_wrapper ']/div[@class='detail_main detail_main_no_tips']/div[@class='htl_info']/div[@class='name']/h2[@class='cn_n']/text()").extract()
    if len(title):
        Title = title
    else:
        Title = response.xpath("//div[@class='main_detail_wrapper ']/div[@class='detail_main detail_main_no_comment']/div[@class='htl_info']/div[@class='name']/h2[@class='cn_n']/text()").extract()
    for itemData in commentData:
        itemDict = dict()
        # hotel name
        hotelname = Title
        hotelnames = hotelname[0] if len(hotelname) else " "
        # username
        username = itemData.xpath("div[1]/p[2]/span/text()").extract()
        usernames = username[0] if len(username) else ""
        # review score
        commentscore = itemData.xpath("div[2]/p/span[2]/span/text()").extract()
        commentscores = commentscore[0] if len(commentscore) else ""
        # check-in date
        intime = itemData.xpath("div[2]/p/span[3]/text()").extract()
        intimes = intime[0] if len(intime) else ""
        # trip type
        tourstyle = itemData.xpath("div[2]/p/span[4]/text()").extract()
        tourstyles = re.sub('\w', '', tourstyle[0]) if len(tourstyle) else ""
        # number of likes
        praisenum = itemData.xpath("div[2]/div[@class='comment_txt']/div[@class='comment_bar']/a/span/text()").extract()
        praisenums = re.sub('\D', '', praisenum[0]) if len(praisenum) else ""
        # review publication time (split on u"于", "posted on")
        commenttime = itemData.xpath("div[2]/div[@class='comment_txt']/div[@class='comment_bar']/p/span/text()").extract()
        commenttimes = commenttime[0].split(u"于")[1] if len(commenttime) else ""
        # review text
        comment = itemData.xpath("div[2]/div[@class='comment_txt']/div[1]/text()").extract()
        comments = comment[0] if len(comment) else ""
        self.commList.append({"title": hotelnames, "username": usernames,
                              "commentscore": commentscores, "intime": intimes,
                              "tourstyle": tourstyles, "praisenum": praisenums,
                              "commenttime": commenttimes, "comment": comments})