def _extract_comments(tree): """解析出评论 Args: tree: ElementTree Returns: comments: list """ comment_elems = tree.xpath("//div[@id='REVIEWS']/div[@class='reviewSelector']") comments = [] for comment_elem in comment_elems: try: # 第一类型网页 comment_user = flist(comment_elem.xpath( "div/div[@class='col1of2']" "//div[@class='username mo']/span/text()"), u"") comment_score = flist(comment_elem.xpath( "div/div[@class='col2of2 ']" "/div[@class='rating reviewItemInline']" "/span[@class='rate rate_s s50']/img/@content"), u"") comment_time = flist(comment_elem.xpath( "div/div[@class='col2of2 ']" "/div[@class='rating reviewItemInline']" "/span[@class='ratingDate']/text()"), u"") content_elems = comment_elem.xpath( "div/div[@class='col2of2 ']/div[@class='entry']/p") texts = [] for content_elem in content_elems: if content_elem.text is not None: texts.append(content_elem.text) comment_content = u"".join(texts).strip() # 第二类型网页 if len(comment_user.strip()) == 0: comment_user = flist(comment_elem.xpath( "div/div[@class='col1of2']" "//div[@class='username']/span/text()"), u"") if len(comment_time.strip()) == 0: comment_time = flist(comment_elem.xpath( "div/div[@class='col2of2']" "/span[@class='ratingDate']/text()"), u"") if len(comment_content.strip()) == 0: content_elems = comment_elem.xpath( "div/div[@class='col2of2']" "/div[@class='entry']/p") texts = [] for content_elem in content_elems: if content_elem.text is not None: texts.append(content_elem.text) comment_content = u"".join(texts).strip() except Exception, e: print "extract one comment failed error:%s" % e print traceback.format_exc() else: if len(unicode(comment_content)) != 0: comment_item = CommentItem(unicode(comment_user).strip(), unicode(comment_time).strip(), unicode(comment_score).strip(), unicode(comment_content).strip()) comments.append(comment_item)
def parse(self, task, input_file):
    """Parse an attraction detail page.

    Args:
        task: HttpTask, the crawl task (kwargs carry name/address/hot/rank)
        input_file: StringIO, the fetched page

    Yields:
        AttractionItem, then either a description HttpTask or an empty
        DescriptionItem when the page has no description link.
    """
    self.logger.debug("attraction parser start to parse")
    content = input_file.read()
    # Keep the raw content: _extract_play_spend_and_unit works on the raw
    # string while the xpath queries work on the parsed tree.
    tree = html.parse(StringIO(content))
    try:
        zip_code = flist(tree.xpath("//span[@class='postal-code']/text()"),
                         u"")
        play_spend, play_spend_unit = _extract_play_spend_and_unit(content)
        tel_phone = flist(tree.xpath("//div[@id='HEADING_GROUP']"
                                     "/div[@class='wrap infoBox']"
                                     "/div[@class='odcHotel blDetails']"
                                     "/div/div[@class='fl']/text()"), u"")
        open_time = u""  # not available on this page layout
        total_score = flist(tree.xpath("//div[@class='rs rating']"
                                       "/span/img/@content"))
        ticket_info = u""  # not available on this page layout
        preview_relate_path = flist(tree.xpath(
            "//div[@class='listing_description']/a/@href"), u"")
        lon, lat = _extract_lon_lat(flist(tree.xpath(
            "//div[@class='js_mapThumb']"
            "/div[@id='bmapContainer']/img[1]/@src"), u""))
        comments = _extract_comments(tree)
        # Build the attraction item (description is fetched separately).
        attraction_item = AttractionItem(task.request.url,
                                         task.kwargs['name'],
                                         unicode(play_spend),
                                         play_spend_unit,
                                         task.kwargs['address'],
                                         unicode(tel_phone),
                                         unicode(open_time),
                                         unicode(total_score),
                                         unicode(ticket_info),
                                         task.kwargs['hot'], lon, lat,
                                         task.kwargs['rank'], comments,
                                         unicode(zip_code))
        yield attraction_item
        # Emit the description task, or an empty item when there is no link.
        if len(preview_relate_path) != 0:
            description_request = build_description_request(
                task.request.url, preview_relate_path)
            description_task = HttpTask(description_request,
                                        callback="DescriptionParser",
                                        max_fail_count=3,
                                        cookie_host=LVYOU_HOST,
                                        kwargs={'url': task.request.url})
            yield description_task
        else:
            yield DescriptionItem(task.request.url, u"")
    except Exception, e:
        print "error:%s" % e
        print traceback.format_exc()
def _extract_address(info_elem):
    """Build the full address (country + city + street) for a listing.

    Args:
        info_elem: Element, listing info node containing an <address> tag

    Returns:
        address: unicode, country name, city and street concatenated
    """
    parts = []
    for expr in ("address/span[@class='country-name']/text()",
                 "address/span[@class='locality']/text()"):
        parts.append(flist(info_elem.xpath(expr), u""))
    # The street is embedded in an inline <script>; decode it separately.
    script_text = flist(info_elem.xpath("address/span/script/text()"), u"")
    parts.append(_extract_script_text(script_text))
    return u"%s%s%s" % tuple(parts)
def _extract_address(info_elem):
    """Build the full address (country + city + street) for a listing.

    Args:
        info_elem: Element, listing info node containing an <address> tag

    Returns:
        address: unicode, country name, city and street concatenated
    """
    parts = []
    for expr in ("address/span[@class='country-name']/text()",
                 "address/span[@class='locality']/text()"):
        parts.append(flist(info_elem.xpath(expr), u""))
    # The street is embedded in an inline <script>; decode it separately.
    script_text = flist(info_elem.xpath("address/span/script/text()"), u"")
    parts.append(_extract_script_text(script_text))
    return u"%s%s%s" % tuple(parts)
def parse(self, task, input_file):
    """Parse a product detail page.

    Args:
        task: HttpTask, the crawl task (kwargs carry url/cookie info)
        input_file: file, the fetched page

    Yields:
        WebItem with the extracted detail data, then the picture task
        when a picture still has to be downloaded.
    """
    tree = html.parse(input_file)
    pic_url = unicode(flist(tree.xpath("//div[@class='product-price-left']"
                                       "/p/img/@src"), default=u""))
    desc_elems = tree.xpath("//div[@class='product-detail-alla-cont']")
    description = _extract_desc_elems(desc_elems)
    date_elems = tree.xpath("//ul[@class='productnew-header-pricea2-ul"
                            " clearfloat']/li/@d")
    telephone = flist(tree.xpath("//div[@class='top-w']//li[@class='tel']"
                                 "/span/text()"), default=u"")
    telephone = telephone.replace(u"-", u"")
    if len(telephone) == 0:
        # Fall back to the site's generic service hotline.
        telephone = u"4006228228"
    price_elems = tree.xpath("//ul[@class='productnew-header-pricec2-ul "
                             "productnew-"
                             "header-pricec3-ul productnew-header-"
                             "pricec2-cq']/li/@title")
    price_infos = list()
    for price_elem in price_elems:
        # De-duplicate while preserving first-seen order.
        if unicode(price_elem) not in price_infos:
            price_infos.append(unicode(price_elem))
    price_info = u"/".join(price_infos)
    time_infos = []
    for date_elem in date_elems:
        time_infos.append(date_elem)
    time_info = u";".join(time_infos)
    url = task.kwargs.get('url')
    cookie_host = task.kwargs.get('cookie_host')
    cookie_count = task.kwargs.get('cookie_count')
    pictures, pic_task = self._check_and_execute_picture(
        pic_url, cookie_host, cookie_count)
    # Emit the detail item.
    yield WebItem(url, telephone, description, pictures, time_info,
                  price_info, u"")
    # Emit the picture task, if any.
    if pic_task is not None:
        yield pic_task
def _extract_play_spend(tree):
    """Extract the recommended visit duration and its unit.

    Args:
        tree: ElementTree of the parsed page

    Returns:
        (play_spend, play_spend_unit): both unicode; the duration string is
        normalised (Chinese numerals to digits, ranges to "-", etc.)
    """
    play_spend = flist(tree.xpath(
        "//div[@id='J-aside-info-recommend_visit_time']"
        "/span[@class='val recommend_visit_time-value']/text()"), u"")
    # Pick the unit from whichever time word appears in the raw text.
    if play_spend.find(u"天") != -1:
        play_spend_unit = u"天"
    elif play_spend.find(u"小时") != -1:
        play_spend_unit = u"小时"
    elif play_spend.find(u"分") != -1 or play_spend.find(u"分钟") != -1:
        play_spend_unit = u"分"
    else:
        # Covers both the empty string and any unrecognised unit.
        play_spend_unit = u""
    # Strip unit words and map Chinese numerals/range markers to ASCII.
    replacements = ((u"小时", u""), (u"天", u""), (u"时", u""), (u"小", u""),
                    (u"分钟", u""), (u"分", u""), (u"钟", u""), (u"至", u"-"),
                    (u"超过", u">"), (u"半", u".5"), (u"一", u"1"),
                    (u"二", u"2"), (u"三", u"3"), (u"四", u"4"), (u"五", u"5"),
                    (u"六", u"6"), (u"七", u"7"), (u"八", u"8"), (u"九", u"9"))
    for old, new in replacements:
        play_spend = play_spend.replace(old, new)
    # A bare "half" (半) becomes ".5"; present it as "0.5".
    if play_spend.strip() == u".5":
        play_spend = u"0.5"
    return play_spend, play_spend_unit
def parse(self, task, input_file):
    """Parse a product detail page.

    Args:
        task: HttpTask, the crawl task (kwargs carry url/cookie info)
        input_file: file, the fetched page

    Yields:
        WebItem with the extracted detail data, then the picture task
        when a picture still has to be downloaded.
    """
    tree = html.parse(input_file)
    pic_url = unicode(flist(tree.xpath("//div[@class='product-price-left']"
                                       "/p/img/@src"), default=u""))
    desc_elems = tree.xpath("//div[@class='product-detail-alla-cont']")
    description = _extract_desc_elems(desc_elems)
    date_elems = tree.xpath("//ul[@class='productnew-header-pricea2-ul"
                            " clearfloat']/li/@d")
    telephone = flist(tree.xpath("//div[@class='top-w']//li[@class='tel']"
                                 "/span/text()"), default=u"")
    telephone = telephone.replace(u"-", u"")
    if len(telephone) == 0:
        # Fall back to the site's generic service hotline.
        telephone = u"4006228228"
    price_elems = tree.xpath("//ul[@class='productnew-header-pricec2-ul "
                             "productnew-"
                             "header-pricec3-ul productnew-header-"
                             "pricec2-cq']/li/@title")
    price_infos = list()
    for price_elem in price_elems:
        # De-duplicate while preserving first-seen order.
        if unicode(price_elem) not in price_infos:
            price_infos.append(unicode(price_elem))
    price_info = u"/".join(price_infos)
    time_infos = []
    for date_elem in date_elems:
        time_infos.append(date_elem)
    time_info = u";".join(time_infos)
    url = task.kwargs.get('url')
    cookie_host = task.kwargs.get('cookie_host')
    cookie_count = task.kwargs.get('cookie_count')
    pictures, pic_task = self._check_and_execute_picture(pic_url,
                                                         cookie_host,
                                                         cookie_count)
    # Emit the detail item.
    yield WebItem(url, telephone, description, pictures, time_info,
                  price_info, u"")
    # Emit the picture task, if any.
    if pic_task is not None:
        yield pic_task
def parse(self, task, input_file):
    """Parse a product detail page (variant without picture handling).

    Args:
        task: HttpTask, the crawl task
        input_file: file, the fetched page

    Yields:
        WebItem with the extracted detail data.
    """
    tree = html.parse(input_file)
    # NOTE(review): the predicate says @clas, not @class — possibly a typo,
    # but it may match the real page markup; confirm before "fixing" it.
    name = flist(tree.xpath(u"//div["
                            u"@clas='product-price-titleul']/h1/text()"))
    desc_elems = tree.xpath(u"//div[@class='product-detail-alla-cont']")
    description = _extract_desc_elems(desc_elems)
    date_elems = tree.xpath(
        u"//ul[@class='productnew-header-pricea2-ul clearfloat']/li/@d")
    telephone = flist(tree.xpath(
        u"//div[@class='top-w']//li[@class='tel']/span/text()"))
    telephone = telephone.replace(u"-", u"")
    if len(telephone) == 0:
        # Fall back to the site's generic service hotline.
        telephone = u"4006228228"
    price_elems = tree.xpath(
        u"//ul[@class='productnew-header-pricec2-ul productnew-"
        u"header-pricec3-ul productnew-header-pricec2-cq']/li/@title")
    price_infos = list()
    for price_elem in price_elems:
        # De-duplicate while preserving first-seen order.
        if unicode(price_elem) not in price_infos:
            price_infos.append(unicode(price_elem))
    price_info = u"/".join(price_infos)
    time_infos = []
    for date_elem in date_elems:
        time_infos.append(date_elem)
    time_info = u";".join(time_infos)
    url = task.request.url
    # Emit the detail item.
    yield WebItem(url, telephone, description, time_info, price_info, name)
def _extract_tips(tree):
    """Extract the travel-tips text from the page.

    Args:
        tree: ElementTree of the parsed page

    Returns:
        tips: unicode, all text under the attention article, u"" if absent
    """
    article = flist(tree.xpath("//div[@id='mod-attention']/"
                               "article[@class='content-article']"), None)
    if article is None:
        return u""
    # Concatenate every text fragment below the article node.
    return u"".join(article.itertext())
def _extract_tips(tree):
    """Extract the travel-tips text from the page.

    Args:
        tree: ElementTree of the parsed page

    Returns:
        tips: unicode, all text under the attention article, u"" if absent
    """
    article = flist(tree.xpath("//div[@id='mod-attention']/"
                               "article[@class='content-article']"), None)
    if article is None:
        return u""
    # Concatenate every text fragment below the article node.
    return u"".join(article.itertext())
def _extract_play_spend(tree):
    """Extract the recommended visit duration and its unit.

    Args:
        tree: ElementTree of the parsed page

    Returns:
        (play_spend, play_spend_unit): both unicode; the duration string is
        normalised (Chinese numerals to digits, ranges to "-", etc.)
    """
    play_spend = flist(tree.xpath(
        "//div[@id='J-aside-info-recommend_visit_time']"
        "/span[@class='val recommend_visit_time-value']/text()"), u"")
    # Pick the unit from whichever time word appears in the raw text.
    if play_spend.find(u"天") != -1:
        play_spend_unit = u"天"
    elif play_spend.find(u"小时") != -1:
        play_spend_unit = u"小时"
    elif play_spend.find(u"分") != -1 or play_spend.find(u"分钟") != -1:
        play_spend_unit = u"分"
    else:
        # Covers both the empty string and any unrecognised unit.
        play_spend_unit = u""
    # Strip unit words and map Chinese numerals/range markers to ASCII.
    replacements = ((u"小时", u""), (u"天", u""), (u"时", u""), (u"小", u""),
                    (u"分钟", u""), (u"分", u""), (u"钟", u""), (u"至", u"-"),
                    (u"超过", u">"), (u"半", u".5"), (u"一", u"1"),
                    (u"二", u"2"), (u"三", u"3"), (u"四", u"4"), (u"五", u"5"),
                    (u"六", u"6"), (u"七", u"7"), (u"八", u"8"), (u"九", u"9"))
    for old, new in replacements:
        play_spend = play_spend.replace(old, new)
    # A bare "half" (半) becomes ".5"; present it as "0.5".
    if play_spend.strip() == u".5":
        play_spend = u"0.5"
    return play_spend, play_spend_unit
def _extract_preview(tree):
    """Extract the preview/abstract text (handles both page layouts).

    Args:
        tree: ElementTree of the parsed page

    Returns:
        preview: the preview text, empty string when absent
    """
    # Page layout 1: paragraphs under the sidebar article.
    preview_elems = tree.xpath("//div[@class='sidebar-mod-inner']/"
                               "article/div[@class='']/p")
    preview_list = []
    for preview_elem in preview_elems:
        # BUGFIX: empty <p> nodes have .text None, which made "".join raise
        # TypeError; guard like the other extractors in this file do.
        if preview_elem.text is not None:
            preview_list.append(preview_elem.text)
    preview = "".join(preview_list)
    # Page layout 2: the abstract holder div.
    if len(preview) == 0:
        preview = flist(tree.xpath("//div[@class='view-mod-desc-main']"
                                   "/div[@id='view-mod-abstract']/"
                                   "div[@class='desc-all-holder']"
                                   "/text()"), u"")
    return preview
def _extract_preview(tree):
    """Extract the preview/abstract text (handles both page layouts).

    Args:
        tree: ElementTree of the parsed page

    Returns:
        preview: the preview text, empty string when absent
    """
    # Page layout 1: paragraphs under the sidebar article.
    preview_elems = tree.xpath("//div[@class='sidebar-mod-inner']/"
                               "article/div[@class='']/p")
    preview_list = []
    for preview_elem in preview_elems:
        # BUGFIX: empty <p> nodes have .text None, which made "".join raise
        # TypeError; guard like the other extractors in this file do.
        if preview_elem.text is not None:
            preview_list.append(preview_elem.text)
    preview = "".join(preview_list)
    # Page layout 2: the abstract holder div.
    if len(preview) == 0:
        preview = flist(tree.xpath("//div[@class='view-mod-desc-main']"
                                   "/div[@id='view-mod-abstract']/"
                                   "div[@class='desc-all-holder']"
                                   "/text()"), u"")
    return preview
def parse(self, task, input_file):
    """Parse an attraction list page.

    Args:
        task: HttpTask, the crawl task
        input_file: StringIO, the fetched page

    Yields:
        HttpTask, one AttractionParser task per listed attraction.
    """
    tree = html.parse(input_file)
    attraction_elems = tree.xpath(
        "//div[@id='ATTRACTION_OVERVIEW']"
        "/div[@class='attraction-list clearfix']")
    for attraction_elem in attraction_elems:
        try:
            info_elem = flist(
                attraction_elem.xpath(
                    "div[@class='clearfix']/div[@class='info']"), None)
            rank_elem = flist(
                attraction_elem.xpath(
                    "div[@class='clearfix']/div[@class='rank']"), None)
            relate_path = flist(
                info_elem.xpath("div[@class='title']/a/@href"), u"")
            name = flist(info_elem.xpath("div[@class='title']/a/text()"),
                         u"")
            address = _extract_address(info_elem)
            hot = flist(rank_elem.xpath("a/strong/text()"), u"")
            rank = flist(rank_elem.xpath("span[1]/strong/text()"), u"")
            # Build the per-attraction detail task.
            http_request = build_attraction_request(relate_path)
            attraction_task = HttpTask(http_request,
                                       callback="AttractionParser",
                                       max_fail_count=3,
                                       cookie_host=LVYOU_HOST,
                                       kwargs={
                                           "name": unicode(name).strip(),
                                           "address": unicode(address),
                                           "hot": unicode(hot),
                                           "rank": unicode(rank)
                                       })
            yield attraction_task
        except Exception, e:
            # One bad listing must not abort the rest of the page.
            self.logger.warn("extract one attraction failed error:%s" % e)
def parse(self, task, input_file):
    """Parse an attraction list page.

    Args:
        task: HttpTask, the crawl task
        input_file: StringIO, the fetched page

    Yields:
        HttpTask, one AttractionParser task per listed attraction.
    """
    tree = html.parse(input_file)
    attraction_elems = tree.xpath("//div[@id='ATTRACTION_OVERVIEW']"
                                  "/div[@class='attraction-list clearfix']")
    for attraction_elem in attraction_elems:
        try:
            info_elem = flist(attraction_elem.xpath(
                "div[@class='clearfix']/div[@class='info']"), None)
            rank_elem = flist(attraction_elem.xpath(
                "div[@class='clearfix']/div[@class='rank']"), None)
            relate_path = flist(info_elem.xpath(
                "div[@class='title']/a/@href"), u"")
            name = flist(info_elem.xpath(
                "div[@class='title']/a/text()"), u"")
            address = _extract_address(info_elem)
            hot = flist(rank_elem.xpath("a/strong/text()"), u"")
            rank = flist(rank_elem.xpath("span[1]/strong/text()"), u"")
            # Build the per-attraction detail task.
            http_request = build_attraction_request(relate_path)
            attraction_task = HttpTask(
                http_request, callback="AttractionParser", max_fail_count=3,
                cookie_host=LVYOU_HOST,
                kwargs={"name": unicode(name).strip(), "address": unicode(
                    address), "hot": unicode(hot), "rank": unicode(rank)})
            yield attraction_task
        except Exception, e:
            # One bad listing must not abort the rest of the page.
            self.logger.warn("extract one attraction failed error:%s" % e)
def parse(self, task, input_file):
    """Parse an attraction detail page.

    Args:
        task: HttpTask, the crawl task (kwargs carry name/address/hot/rank)
        input_file: StringIO, the fetched page

    Yields:
        AttractionItem, then either a description HttpTask or an empty
        DescriptionItem when the page has no description link.
    """
    self.logger.debug("attraction parser start to parse")
    content = input_file.read()
    # Keep the raw content: _extract_play_spend_and_unit works on the raw
    # string while the xpath queries work on the parsed tree.
    tree = html.parse(StringIO(content))
    try:
        zip_code = flist(tree.xpath("//span[@class='postal-code']/text()"),
                         u"")
        play_spend, play_spend_unit = _extract_play_spend_and_unit(content)
        tel_phone = flist(
            tree.xpath("//div[@id='HEADING_GROUP']"
                       "/div[@class='wrap infoBox']"
                       "/div[@class='odcHotel blDetails']"
                       "/div/div[@class='fl']/text()"), u"")
        open_time = u""  # not available on this page layout
        total_score = flist(
            tree.xpath("//div[@class='rs rating']"
                       "/span/img/@content"))
        ticket_info = u""  # not available on this page layout
        preview_relate_path = flist(
            tree.xpath("//div[@class='listing_description']/a/@href"), u"")
        lon, lat = _extract_lon_lat(
            flist(
                tree.xpath("//div[@class='js_mapThumb']"
                           "/div[@id='bmapContainer']/img[1]/@src"), u""))
        comments = _extract_comments(tree)
        # Build the attraction item (description is fetched separately).
        attraction_item = AttractionItem(task.request.url,
                                         task.kwargs['name'],
                                         unicode(play_spend),
                                         play_spend_unit,
                                         task.kwargs['address'],
                                         unicode(tel_phone),
                                         unicode(open_time),
                                         unicode(total_score),
                                         unicode(ticket_info),
                                         task.kwargs['hot'], lon, lat,
                                         task.kwargs['rank'], comments,
                                         unicode(zip_code))
        yield attraction_item
        # Emit the description task, or an empty item when there is no link.
        if len(preview_relate_path) != 0:
            description_request = build_description_request(
                task.request.url, preview_relate_path)
            description_task = HttpTask(description_request,
                                        callback="DescriptionParser",
                                        max_fail_count=3,
                                        cookie_host=LVYOU_HOST,
                                        kwargs={'url': task.request.url})
            yield description_task
        else:
            yield DescriptionItem(task.request.url, u"")
    except Exception, e:
        print "error:%s" % e
        print traceback.format_exc()
class AttractionListParser(BaseParser):
    """Parser for attraction list pages: emits one detail task per listed
    attraction and a follow-up task for the next list page."""

    def __init__(self, namespace):
        BaseParser.__init__(self, namespace)
        self.logger.info("init attraction list parser finish")

    def parse(self, task, input_file):
        """Parse an attraction list page.

        Args:
            task: HttpTask, the crawl task
            input_file: StringIO, the fetched page

        Yields:
            HttpTask, one AttractionParser task per attraction, then an
            AttractionListParser task for the next page when present.
        """
        tree = html.parse(input_file)
        attraction_elems = tree.xpath(
            "//div[@id='ATTRACTION_OVERVIEW']"
            "/div[@class='attraction-list clearfix']")
        for attraction_elem in attraction_elems:
            try:
                info_elem = flist(
                    attraction_elem.xpath(
                        "div[@class='clearfix']/div[@class='info']"), None)
                rank_elem = flist(
                    attraction_elem.xpath(
                        "div[@class='clearfix']/div[@class='rank']"), None)
                relate_path = flist(
                    info_elem.xpath("div[@class='title']/a/@href"), u"")
                name = flist(info_elem.xpath("div[@class='title']/a/text()"),
                             u"")
                address = _extract_address(info_elem)
                hot = flist(rank_elem.xpath("a/strong/text()"), u"")
                rank = flist(rank_elem.xpath("span[1]/strong/text()"), u"")
                # Build the per-attraction detail task.
                http_request = build_attraction_request(relate_path)
                attraction_task = HttpTask(http_request,
                                           callback="AttractionParser",
                                           max_fail_count=3,
                                           cookie_host=LVYOU_HOST,
                                           kwargs={
                                               "name": unicode(name).strip(),
                                               "address": unicode(address),
                                               "hot": unicode(hot),
                                               "rank": unicode(rank)
                                           })
                yield attraction_task
            except Exception, e:
                # One bad listing must not abort the rest of the page.
                self.logger.warn("extract one attraction failed error:%s" % e)
        # Build the next-page task when a "next" link exists.
        next_page_relate = flist(
            tree.xpath(
                "//div[@class='pagination']/div"
                "/a[@class='next sprite-arrow-right-green ml6 ']/@href"),
            u"")
        if len(next_page_relate) != 0:
            next_page_request = build_next_page_request(next_page_relate)
            next_page_task = HttpTask(next_page_request,
                                      callback="AttractionListParser",
                                      max_fail_count=5,
                                      cookie_host=LVYOU_HOST)
            yield next_page_task
def _extract_comments(tree): """解析出评论 Args: tree: ElementTree Returns: comments: list """ comment_elems = tree.xpath( "//div[@id='REVIEWS']/div[@class='reviewSelector']") comments = [] for comment_elem in comment_elems: try: # 第一类型网页 comment_user = flist( comment_elem.xpath("div/div[@class='col1of2']" "//div[@class='username mo']/span/text()"), u"") comment_score = flist( comment_elem.xpath( "div/div[@class='col2of2 ']" "/div[@class='rating reviewItemInline']" "/span[@class='rate rate_s s50']/img/@content"), u"") comment_time = flist( comment_elem.xpath("div/div[@class='col2of2 ']" "/div[@class='rating reviewItemInline']" "/span[@class='ratingDate']/text()"), u"") content_elems = comment_elem.xpath( "div/div[@class='col2of2 ']/div[@class='entry']/p") texts = [] for content_elem in content_elems: if content_elem.text is not None: texts.append(content_elem.text) comment_content = u"".join(texts).strip() # 第二类型网页 if len(comment_user.strip()) == 0: comment_user = flist( comment_elem.xpath("div/div[@class='col1of2']" "//div[@class='username']/span/text()"), u"") if len(comment_time.strip()) == 0: comment_time = flist( comment_elem.xpath("div/div[@class='col2of2']" "/span[@class='ratingDate']/text()"), u"") if len(comment_content.strip()) == 0: content_elems = comment_elem.xpath("div/div[@class='col2of2']" "/div[@class='entry']/p") texts = [] for content_elem in content_elems: if content_elem.text is not None: texts.append(content_elem.text) comment_content = u"".join(texts).strip() except Exception, e: print "extract one comment failed error:%s" % e print traceback.format_exc() else: if len(unicode(comment_content)) != 0: comment_item = CommentItem( unicode(comment_user).strip(), unicode(comment_time).strip(), unicode(comment_score).strip(), unicode(comment_content).strip()) comments.append(comment_item)
def parse(self, task, input_file): """parse method Args: task: Task, task input_file: file: file with content Yields: item: Item, result of parse task: Task, new task """ self.logger.debug("room parser begin to parse") try: try: soap_tree = etree.fromstring(input_file.read()) except Exception, e: self.logger.error("not complete xml:%s" % e) raise ParserError("not complete xml") hotel_address_dict = task.kwargs.get('address') soap_elems = xpath_namespace( soap_tree, "/soap:Envelope/soap:Body/" "RequestResponse/RequestResult") xml_str = soap_elems[0].text tree = etree.fromstring(xml_str) elems = tree.xpath("/Response/Header") header = elems[0] if "ResultCode" not in header.attrib or \ header.attrib['ResultCode'] != "Success": self.logger.error("not has resultcode or " "resultcode is not success") raise ParserError("ResultCode error") else: content_elems = xpath_namespace( tree, "/Response/HotelResponse/" "OTA_HotelDescriptiveInfoRS/" "HotelDescriptiveContents/" "HotelDescriptiveContent") for content_elem in content_elems: item_hotel_code = None item_hotel_city_code = task.kwargs.get('citycode') try: item_hotel_code = content_elem.attrib.get('HotelCode') item_hotel_name = content_elem.attrib.get('HotelName') item_hotel_brand_id = \ content_elem.attrib.get('BrandCode') position_elem = flist( xpath_namespace(content_elem, "HotelInfo//Position"), None) item_hotel_latitude = "" if position_elem is None \ or "Latitude" not in position_elem.attrib \ else position_elem.attrib.get('Latitude') item_hotel_longitude = "" if position_elem is None \ or "Longitude" not in position_elem.attrib \ else position_elem.attrib.get('Longitude') service_elems = xpath_namespace( content_elem, "HotelInfo/Services/Service") item_hotel_service = u"、".join([ flist( service.xpath( "*[local-name()='DescriptiveText']/text()") ) for service in service_elems if "Code" in service.attrib and service.attrib["Code"] in HOTEL_SERVICE_CODES ]) item_room_service = u"、".join([ flist( service.xpath( 
"*[local-name()='DescriptiveText']/text()") ) for service in service_elems if "Code" in service.attrib and service.attrib["Code"] in ROOM_SERVICE_CODES ]) awards_elem = flist( xpath_namespace(content_elem, "AffiliationInfo/Awards"), None) item_hotel_star, item_hotel_rate = ("", "") \ if awards_elem is None else \ (flist(awards_elem.xpath( "*[local-name()='Award' and " "@Provider='HotelStarRate']/@Rating")), flist(awards_elem.xpath( "*[local-name()='Award' and " "@Provider='CtripStarRate']/@Rating"))) multimedia_elem = flist( xpath_namespace(content_elem, "MultimediaDescriptions"), None) image_elems = [] if multimedia_elem is None \ else xpath_namespace( multimedia_elem, "MultimediaDescription/ImageItems/ImageItem") item_image_list = [] for image_elem in image_elems: image_url = flist( image_elem.xpath( "*[local-name()='ImageFormat']/" "*[local-name()='URL']/text()")) image_type = flist(image_elem.xpath("@Category")) if not image_url and not image_type: continue image_text = flist( image_elem.xpath( "*[local-name()='Description']/@Caption")) item_image_dict = { "image_url": image_url, "image_type": image_type, "image_text": image_text.encode('utf-8') } item_image_list.append(item_image_dict) if item_hotel_code and image_url: image_item = ImageItem(item_hotel_code, str(image_type), unicode(image_text), str(image_url)) yield image_item text_items_elem = flist( xpath_namespace(multimedia_elem, "MultimediaDescription/TextItems"), None) item_hotel_preview = "" if text_items_elem is None \ else flist(text_items_elem.xpath( "*[local-name()='TextItem' and @Category='5']/" "*[local-name()='Description']/text()")) room_elems = xpath_namespace( content_elem, "FacilityInfo/" "GuestRooms/GuestRoom") item_room_list = [] for room_elem in room_elems: room_info_id = flist( room_elem.xpath( "*[local-name()='TypeRoom']/@RoomTypeCode") ) room_info_name = flist( room_elem.xpath("@RoomTypeName")) room_bed_type = flist( room_elem.xpath( "*[local-name()='TypeRoom']/@BedTypeCode")) 
room_net_service, room_net_service_fee = \ _extract_net_service(room_elem) room_info_rate_price = "" room_hot = "" room_floor = flist( room_elem.xpath( "*[local-name()='TypeRoom']/@Floor")) room_breakfast = "" room_area = "" room_info_dict = { 'roomInfo_id': room_info_id, 'roomInfo_ratePrice': room_info_rate_price, 'hot': room_hot } if room_info_id and room_info_name and \ item_hotel_code: item_room_list.append(room_info_dict) room_item = RoomInfoItem( item_hotel_code, str(room_info_id), room_info_name, room_floor, room_net_service, room_net_service_fee, room_bed_type, room_breakfast, room_area) yield room_item item_hotel_address = "" if item_hotel_code not in \ hotel_address_dict \ else hotel_address_dict.get(item_hotel_code) hotel_item = HotelInfoItem( item_hotel_code, item_hotel_city_code, item_hotel_name, item_hotel_brand_id, item_hotel_latitude, item_hotel_longitude, item_hotel_service, item_room_service, item_hotel_star, item_hotel_rate, item_image_list, item_hotel_preview, item_room_list, item_hotel_address) yield hotel_item except Exception, e: self.logger.warn("one hotel extract error:%s" % e) if item_hotel_code is None: self.logger.error("i am sorry, i can do noting") else: chinese_name = task.kwargs.get('chinesename') yield build_rooms_task_for_hotel( [item_hotel_code], item_hotel_city_code, chinese_name, hotel_address_dict)
def parse(self, task, input_file): """parse response result Args: task: FileTask or HttpTask input_file: file or StringIO Yields: item: Item, result of parse task: Task, new task """ self.logger.info("hotel parser begin to parse") try: try: soap_tree = etree.fromstring(input_file.read()) except Exception, e: self.logger.error("not complete xml:%s" % e) raise ParserError("not complete xml") soap_elems = xpath_namespace(soap_tree, "/soap:Envelope/soap:Body/" "RequestResponse/RequestResult") xml_str = soap_elems[0].text tree = etree.fromstring(xml_str) elems = tree.xpath("/Response/Header") header = elems[0] if "ResultCode" not in header.attrib or \ header.attrib['ResultCode'] != "Success": self.logger.error("not has resultcode or " "resultcode is not success") raise ParserError("ResultCode error") else: # success property_elems = xpath_namespace( tree, "/Response/HotelResponse/OTA_HotelSearchRS/" "Properties/Property") city_code = task.kwargs.get('citycode') chinese_name = task.kwargs.get('chinesename') hotel_requests = list() hotel_addresses = dict() for property_elem in property_elems: hotel_code = str(property_elem.attrib['HotelCode']) \ if "HotelCode" in property_elem.attrib \ else None hotel_ctrip_city_code = str( property_elem.attrib['HotelCityCode']) \ if "HotelCityCode" in property_elem.attrib else None hotel_address = flist(property_elem.xpath( "*[local-name()='Address']/" "*[local-name()='AddressLine']/text()")) if isinstance(hotel_address, unicode): hotel_address = hotel_address.encode("utf-8") hotel_address = str(hotel_address) if hotel_code and hotel_ctrip_city_code: hotel_url = build_hotel_url(hotel_code) yield HotelCodeItem(hotel_code, city_code, hotel_url) hotel_requests.append(hotel_code) hotel_addresses[hotel_code] = hotel_address if len(hotel_requests) >= self.batch_count: yield build_rooms_task_for_hotel(hotel_requests, city_code, chinese_name, hotel_addresses) hotel_addresses.clear() del hotel_requests[:] # send left requests if 
len(hotel_requests) > 0: yield build_rooms_task_for_hotel(hotel_requests, city_code, chinese_name, hotel_addresses) hotel_addresses.clear() del hotel_requests[:]
def parse(self, task, input_file):
    """Parse an attraction detail page (UTF-8 page layout).

    Args:
        task: HttpTask, the crawl task (kwargs carry map_info/seq_sort/
            sid/relate_path)
        input_file: StringIO, the fetched page

    Yields:
        AttractionItem, then an HttpTask for the comment list.

    Raises:
        Exception: re-raised after logging when extraction fails.
    """
    self.logger.debug("attraction parser start to parse")
    parser = html.HTMLParser(encoding='utf-8')
    tree = html.parse(input_file, parser)
    try:
        name = flist(tree.xpath("//header[@class='title-head']/a/p/text()"),
                     u"")
        play_spend, play_spend_unit = _extract_play_spend(tree)
        address = flist(tree.xpath("//div[@id='J-aside-info-address']"
                                   "/span[@class='val address-value']"
                                   "/text()"), u"")
        tel_phone = flist(tree.xpath("//div[@id='J-aside-info-phone']"
                                     "/span[@class='val phone-value']"
                                     "/text()"), u"")
        time_elems = tree.xpath("//div[@id='J-aside-info-opening_hours']"
                                "/div[@class='val opening_hours-value']/p")
        time_list = []
        for time_elem in time_elems:
            # NOTE(review): .text may be None for empty <p>, which would
            # make the join below raise — confirm against real pages.
            time_list.append(time_elem.text)
        open_time = "".join(time_list)
        total_score = flist(tree.xpath("//div[@class='scene-rating']"
                                       "/div/@content"), u"")
        ticket_info = flist(tree.xpath("//div[@id='J-aside-info-price']"
                                       "/div[@class='val price-value']"
                                       "/p/text()"), u"")
        preview = _extract_preview(tree)
        traffic = _extract_traffic(tree)
        tips = _extract_tips(tree)
        hot = flist(tree.xpath("//section[@id='remark-container']"
                               "/div[@class='remark-overall-rating']"
                               "/span[@class='remark-all-counts']"
                               "/text()"), u"")
        # map_info is "lon,lat"; fall back to empty strings when absent.
        lon_lat = task.kwargs['map_info'].split(",")
        if len(lon_lat) <= 1:
            lon, lat = u"", u""
        else:
            lon, lat = lon_lat[0], lon_lat[1]
        seq_sort = task.kwargs['seq_sort']
        sid = task.kwargs['sid']
        attraction_item = AttractionItem(unicode(sid), unicode(name),
                                         unicode(play_spend),
                                         unicode(play_spend_unit),
                                         unicode(address),
                                         unicode(tel_phone),
                                         unicode(open_time),
                                         unicode(total_score),
                                         unicode(ticket_info),
                                         unicode(preview), unicode(hot),
                                         unicode(lon), unicode(lat),
                                         unicode(seq_sort),
                                         unicode(traffic), unicode(tips))
        yield attraction_item
        # Emit the comment-list task for this attraction.
        comments_request = build_comment_list_request(
            sid, task.kwargs['relate_path'])
        comments_task = HttpTask(comments_request,
                                 callback="CommentListParser",
                                 max_fail_count=3,
                                 cookie_host=LVYOU_HOST,
                                 kwargs={'sid': sid})
        yield comments_task
    except Exception, e:
        self.logger.error("extract Attraction failed error:%s" % e)
        self.logger.error("error traceback:%s" % traceback.format_exc())
        raise e
def parse(self, task, input_file): """parse response result Args: task: FileTask or HttpTask input_file: file or StringIO Yields: item: Item, result of parse task: Task, new task """ self.logger.info("hotel parser begin to parse") try: try: soap_tree = etree.fromstring(input_file.read()) except Exception, e: self.logger.error("not complete xml:%s" % e) raise ParserError("not complete xml") soap_elems = xpath_namespace( soap_tree, "/soap:Envelope/soap:Body/" "RequestResponse/RequestResult") xml_str = soap_elems[0].text tree = etree.fromstring(xml_str) elems = tree.xpath("/Response/Header") header = elems[0] if "ResultCode" not in header.attrib or \ header.attrib['ResultCode'] != "Success": self.logger.error("not has resultcode or " "resultcode is not success") raise ParserError("ResultCode error") else: # success property_elems = xpath_namespace( tree, "/Response/HotelResponse/OTA_HotelSearchRS/" "Properties/Property") city_code = task.kwargs.get('citycode') chinese_name = task.kwargs.get('chinesename') hotel_requests = list() hotel_addresses = dict() for property_elem in property_elems: hotel_code = str(property_elem.attrib['HotelCode']) \ if "HotelCode" in property_elem.attrib \ else None hotel_ctrip_city_code = str( property_elem.attrib['HotelCityCode']) \ if "HotelCityCode" in property_elem.attrib else None hotel_address = flist( property_elem.xpath( "*[local-name()='Address']/" "*[local-name()='AddressLine']/text()")) if isinstance(hotel_address, unicode): hotel_address = hotel_address.encode("utf-8") hotel_address = str(hotel_address) if hotel_code and hotel_ctrip_city_code: hotel_url = build_hotel_url(hotel_code) yield HotelCodeItem(hotel_code, city_code, hotel_url) hotel_requests.append(hotel_code) hotel_addresses[hotel_code] = hotel_address if len(hotel_requests) >= self.batch_count: yield build_rooms_task_for_hotel( hotel_requests, city_code, chinese_name, hotel_addresses) hotel_addresses.clear() del hotel_requests[:] # send left requests if 
len(hotel_requests) > 0: yield build_rooms_task_for_hotel(hotel_requests, city_code, chinese_name, hotel_addresses) hotel_addresses.clear() del hotel_requests[:]
def parse(self, task, input_file): """解析函数 Args: task: HTTPTask, 任务 input_file: StringIO, 网页信息 Yields: task: HTTPTask, 任务 item: Item, 解析的结果 """ self.logger.debug("attraction parser start to parse") parser = html.HTMLParser(encoding='utf-8') tree = html.parse(input_file, parser) try: name = flist( tree.xpath("//header[@class='title-head']/a/p/text()"), u"") play_spend, play_spend_unit = _extract_play_spend(tree) address = flist( tree.xpath("//div[@id='J-aside-info-address']" "/span[@class='val address-value']" "/text()"), u"") tel_phone = flist( tree.xpath("//div[@id='J-aside-info-phone']" "/span[@class='val phone-value']" "/text()"), u"") time_elems = tree.xpath("//div[@id='J-aside-info-opening_hours']" "/div[@class='val opening_hours-value']/p") time_list = [] for time_elem in time_elems: time_list.append(time_elem.text) open_time = "".join(time_list) total_score = flist( tree.xpath("//div[@class='scene-rating']" "/div/@content"), u"") ticket_info = flist( tree.xpath("//div[@id='J-aside-info-price']" "/div[@class='val price-value']" "/p/text()"), u"") preview = _extract_preview(tree) traffic = _extract_traffic(tree) tips = _extract_tips(tree) hot = flist( tree.xpath("//section[@id='remark-container']" "/div[@class='remark-overall-rating']" "/span[@class='remark-all-counts']" "/text()"), u"") lon_lat = task.kwargs['map_info'].split(",") if len(lon_lat) <= 1: lon, lat = u"", u"" else: lon, lat = lon_lat[0], lon_lat[1] seq_sort = task.kwargs['seq_sort'] sid = task.kwargs['sid'] attraction_item = AttractionItem(unicode(sid), unicode(name), unicode(play_spend), unicode(play_spend_unit), unicode(address), unicode(tel_phone), unicode(open_time), unicode(total_score), unicode(ticket_info), unicode(preview), unicode(hot), unicode(lon), unicode(lat), unicode(seq_sort), unicode(traffic), unicode(tips)) yield attraction_item # yield comment list task comments_request = build_comment_list_request( sid, task.kwargs['relate_path']) comments_task = HttpTask(comments_request, 
callback="CommentListParser", max_fail_count=3, cookie_host=LVYOU_HOST, kwargs={'sid': sid}) yield comments_task except Exception, e: self.logger.error("extract Attraction failed error:%s" % e) self.logger.error("error traceback:%s" % traceback.format_exc()) raise e
def parse(self, task, input_file): """parse method Args: task: Task, task input_file: file: file with content Yields: item: Item, result of parse task: Task, new task """ self.logger.debug("room parser begin to parse") try: try: soap_tree = etree.fromstring(input_file.read()) except Exception, e: self.logger.error("not complete xml:%s" % e) raise ParserError("not complete xml") hotel_address_dict = task.kwargs.get('address') soap_elems = xpath_namespace(soap_tree, "/soap:Envelope/soap:Body/" "RequestResponse/RequestResult") xml_str = soap_elems[0].text tree = etree.fromstring(xml_str) elems = tree.xpath("/Response/Header") header = elems[0] if "ResultCode" not in header.attrib or \ header.attrib['ResultCode'] != "Success": self.logger.error("not has resultcode or " "resultcode is not success") raise ParserError("ResultCode error") else: content_elems = xpath_namespace(tree, "/Response/HotelResponse/" "OTA_HotelDescriptiveInfoRS/" "HotelDescriptiveContents/" "HotelDescriptiveContent") for content_elem in content_elems: item_hotel_code = None item_hotel_city_code = task.kwargs.get('citycode') try: item_hotel_code = content_elem.attrib.get('HotelCode') item_hotel_name = content_elem.attrib.get('HotelName') item_hotel_brand_id = \ content_elem.attrib.get('BrandCode') position_elem = flist( xpath_namespace(content_elem, "HotelInfo//Position"), None) item_hotel_latitude = "" if position_elem is None \ or "Latitude" not in position_elem.attrib \ else position_elem.attrib.get('Latitude') item_hotel_longitude = "" if position_elem is None \ or "Longitude" not in position_elem.attrib \ else position_elem.attrib.get('Longitude') service_elems = xpath_namespace( content_elem, "HotelInfo/Services/Service") item_hotel_service = u"、".join( [flist(service.xpath( "*[local-name()='DescriptiveText']/text()")) for service in service_elems if "Code" in service.attrib and service.attrib["Code"] in HOTEL_SERVICE_CODES]) item_room_service = u"、".join( [flist(service.xpath( 
"*[local-name()='DescriptiveText']/text()")) for service in service_elems if "Code" in service.attrib and service.attrib["Code"] in ROOM_SERVICE_CODES]) awards_elem = flist( xpath_namespace(content_elem, "AffiliationInfo/Awards"), None) item_hotel_star, item_hotel_rate = ("", "") \ if awards_elem is None else \ (flist(awards_elem.xpath( "*[local-name()='Award' and " "@Provider='HotelStarRate']/@Rating")), flist(awards_elem.xpath( "*[local-name()='Award' and " "@Provider='CtripStarRate']/@Rating"))) multimedia_elem = flist( xpath_namespace(content_elem, "MultimediaDescriptions"), None) image_elems = [] if multimedia_elem is None \ else xpath_namespace( multimedia_elem, "MultimediaDescription/ImageItems/ImageItem") item_image_list = [] for image_elem in image_elems: image_url = flist(image_elem.xpath( "*[local-name()='ImageFormat']/" "*[local-name()='URL']/text()")) image_type = flist(image_elem.xpath("@Category")) if not image_url and not image_type: continue image_text = flist(image_elem.xpath( "*[local-name()='Description']/@Caption")) item_image_dict = {"image_url": image_url, "image_type": image_type, "image_text": image_text.encode('utf-8')} item_image_list.append(item_image_dict) if item_hotel_code and image_url: image_item = ImageItem(item_hotel_code, str(image_type), unicode(image_text), str(image_url)) yield image_item text_items_elem = flist(xpath_namespace( multimedia_elem, "MultimediaDescription/TextItems"), None) item_hotel_preview = "" if text_items_elem is None \ else flist(text_items_elem.xpath( "*[local-name()='TextItem' and @Category='5']/" "*[local-name()='Description']/text()")) room_elems = xpath_namespace(content_elem, "FacilityInfo/" "GuestRooms/GuestRoom") item_room_list = [] for room_elem in room_elems: room_info_id = flist(room_elem.xpath( "*[local-name()='TypeRoom']/@RoomTypeCode")) room_info_name = flist(room_elem.xpath( "@RoomTypeName")) room_bed_type = flist(room_elem.xpath( "*[local-name()='TypeRoom']/@BedTypeCode")) room_net_service, 
room_net_service_fee = \ _extract_net_service(room_elem) room_info_rate_price = "" room_hot = "" room_floor = flist(room_elem.xpath( "*[local-name()='TypeRoom']/@Floor")) room_breakfast = "" room_area = "" room_info_dict = {'roomInfo_id': room_info_id, 'roomInfo_ratePrice': room_info_rate_price, 'hot': room_hot} if room_info_id and room_info_name and \ item_hotel_code: item_room_list.append(room_info_dict) room_item = RoomInfoItem(item_hotel_code, str(room_info_id), room_info_name, room_floor, room_net_service, room_net_service_fee, room_bed_type, room_breakfast, room_area) yield room_item item_hotel_address = "" if item_hotel_code not in \ hotel_address_dict \ else hotel_address_dict.get(item_hotel_code) hotel_item = HotelInfoItem(item_hotel_code, item_hotel_city_code, item_hotel_name, item_hotel_brand_id, item_hotel_latitude, item_hotel_longitude, item_hotel_service, item_room_service, item_hotel_star, item_hotel_rate, item_image_list, item_hotel_preview, item_room_list, item_hotel_address) yield hotel_item except Exception, e: self.logger.warn("one hotel extract error:%s" % e) if item_hotel_code is None: self.logger.error("i am sorry, i can do noting") else: chinese_name = task.kwargs.get('chinesename') yield build_rooms_task_for_hotel( [item_hotel_code], item_hotel_city_code, chinese_name, hotel_address_dict)