def parse(self, task, input_file):
    """Parse a hotel-search SOAP response into hotel codes and room tasks.

    Args:
        task: FileTask or HttpTask; ``task.kwargs`` is read for the
            ``citycode`` and ``chinesename`` entries.
        input_file: file or StringIO whose content is the SOAP response.

    Yields:
        item: a HotelCodeItem for every property that has both a
            ``HotelCode`` and a ``HotelCityCode`` attribute.
        task: the result of ``build_rooms_task_for_hotel`` each time
            ``self.batch_count`` hotel codes have accumulated, and once
            more for any remainder.

    Raises:
        ParserError: if the outer envelope cannot be parsed as XML, or if
            the response header's ``ResultCode`` is absent or is not
            ``"Success"``.
    """
    self.logger.info("hotel parser begin to parse")

    try:
        soap_tree = etree.fromstring(input_file.read())
    except Exception as e:
        self.logger.error("not complete xml:%s" % e)
        raise ParserError("not complete xml")

    # The payload is a second XML document carried as the text of the
    # RequestResult element, so it is parsed again here.
    soap_elems = xpath_namespace(
        soap_tree,
        "/soap:Envelope/soap:Body/"
        "RequestResponse/RequestResult")
    tree = etree.fromstring(soap_elems[0].text)

    header = tree.xpath("/Response/Header")[0]
    if header.attrib.get('ResultCode') != "Success":
        self.logger.error("not has resultcode or "
                          "resultcode is not success")
        raise ParserError("ResultCode error")

    property_elems = xpath_namespace(
        tree,
        "/Response/HotelResponse/OTA_HotelSearchRS/"
        "Properties/Property")
    city_code = task.kwargs.get('citycode')
    chinese_name = task.kwargs.get('chinesename')

    batch_codes = []
    batch_addresses = {}
    for property_elem in property_elems:
        attrib = property_elem.attrib
        hotel_code = str(attrib['HotelCode']) \
            if "HotelCode" in attrib else None
        hotel_ctrip_city_code = str(attrib['HotelCityCode']) \
            if "HotelCityCode" in attrib else None

        hotel_address = flist(property_elem.xpath(
            "*[local-name()='Address']/"
            "*[local-name()='AddressLine']/text()"))
        if isinstance(hotel_address, unicode):
            hotel_address = hotel_address.encode("utf-8")
        hotel_address = str(hotel_address)

        if not (hotel_code and hotel_ctrip_city_code):
            continue

        yield HotelCodeItem(hotel_code, city_code,
                            build_hotel_url(hotel_code))
        batch_codes.append(hotel_code)
        batch_addresses[hotel_code] = hotel_address

        if len(batch_codes) >= self.batch_count:
            yield build_rooms_task_for_hotel(
                batch_codes, city_code, chinese_name, batch_addresses)
            # Start fresh containers instead of clearing in place: if the
            # task built above keeps references to the list/dict it was
            # given, clearing them would empty the task's own data once
            # this generator is resumed.
            batch_codes = []
            batch_addresses = {}

    # Send whatever is left over from the last, partially filled batch.
    if batch_codes:
        yield build_rooms_task_for_hotel(
            batch_codes, city_code, chinese_name, batch_addresses)
def parse(self, task, input_file):
    """Parse a hotel-search SOAP response into hotel codes and room tasks.

    Args:
        task: FileTask or HttpTask; ``task.kwargs`` is read for the
            ``citycode`` and ``chinesename`` entries.
        input_file: file or StringIO whose content is the SOAP response.

    Yields:
        item: a HotelCodeItem for every property that has both a
            ``HotelCode`` and a ``HotelCityCode`` attribute.
        task: the result of ``build_rooms_task_for_hotel`` each time
            ``self.batch_count`` hotel codes have accumulated, and once
            more for any remainder.

    Raises:
        ParserError: if the outer envelope cannot be parsed as XML, or if
            the response header's ``ResultCode`` is absent or is not
            ``"Success"``.
    """
    self.logger.info("hotel parser begin to parse")

    try:
        soap_tree = etree.fromstring(input_file.read())
    except Exception as e:
        self.logger.error("not complete xml:%s" % e)
        raise ParserError("not complete xml")

    # The payload is a second XML document carried as the text of the
    # RequestResult element, so it is parsed again here.
    soap_elems = xpath_namespace(
        soap_tree,
        "/soap:Envelope/soap:Body/"
        "RequestResponse/RequestResult")
    tree = etree.fromstring(soap_elems[0].text)

    header = tree.xpath("/Response/Header")[0]
    if header.attrib.get('ResultCode') != "Success":
        self.logger.error("not has resultcode or "
                          "resultcode is not success")
        raise ParserError("ResultCode error")

    property_elems = xpath_namespace(
        tree,
        "/Response/HotelResponse/OTA_HotelSearchRS/"
        "Properties/Property")
    city_code = task.kwargs.get('citycode')
    chinese_name = task.kwargs.get('chinesename')

    batch_codes = []
    batch_addresses = {}
    for property_elem in property_elems:
        attrib = property_elem.attrib
        hotel_code = str(attrib['HotelCode']) \
            if "HotelCode" in attrib else None
        hotel_ctrip_city_code = str(attrib['HotelCityCode']) \
            if "HotelCityCode" in attrib else None

        hotel_address = flist(property_elem.xpath(
            "*[local-name()='Address']/"
            "*[local-name()='AddressLine']/text()"))
        if isinstance(hotel_address, unicode):
            hotel_address = hotel_address.encode("utf-8")
        hotel_address = str(hotel_address)

        if not (hotel_code and hotel_ctrip_city_code):
            continue

        yield HotelCodeItem(hotel_code, city_code,
                            build_hotel_url(hotel_code))
        batch_codes.append(hotel_code)
        batch_addresses[hotel_code] = hotel_address

        if len(batch_codes) >= self.batch_count:
            yield build_rooms_task_for_hotel(
                batch_codes, city_code, chinese_name, batch_addresses)
            # Start fresh containers instead of clearing in place: if the
            # task built above keeps references to the list/dict it was
            # given, clearing them would empty the task's own data once
            # this generator is resumed.
            batch_codes = []
            batch_addresses = {}

    # Send whatever is left over from the last, partially filled batch.
    if batch_codes:
        yield build_rooms_task_for_hotel(
            batch_codes, city_code, chinese_name, batch_addresses)
def parse(self, task, input_file):
    """Parse a hotel-descriptive-info SOAP response into items.

    Args:
        task: Task; ``task.kwargs`` is read for ``address`` (a mapping of
            hotel code to address), ``citycode`` and ``chinesename``.
        input_file: file-like object whose content is the SOAP response.

    Yields:
        item: an ImageItem per image that has a URL, a RoomInfoItem per
            guest room that has both a type code and a name, and one
            HotelInfoItem per ``HotelDescriptiveContent`` element.
        task: when extracting one hotel fails and its hotel code is
            known, the result of ``build_rooms_task_for_hotel`` for that
            single hotel.

    Raises:
        ParserError: if the outer envelope cannot be parsed as XML, or if
            the response header's ``ResultCode`` is absent or is not
            ``"Success"``.
    """
    self.logger.debug("room parser begin to parse")

    try:
        soap_tree = etree.fromstring(input_file.read())
    except Exception as e:
        self.logger.error("not complete xml:%s" % e)
        raise ParserError("not complete xml")

    hotel_address_dict = task.kwargs.get('address')

    # The payload is a second XML document carried as the text of the
    # RequestResult element, so it is parsed again here.
    soap_elems = xpath_namespace(
        soap_tree,
        "/soap:Envelope/soap:Body/"
        "RequestResponse/RequestResult")
    tree = etree.fromstring(soap_elems[0].text)

    header = tree.xpath("/Response/Header")[0]
    if header.attrib.get('ResultCode') != "Success":
        self.logger.error("not has resultcode or "
                          "resultcode is not success")
        raise ParserError("ResultCode error")

    content_elems = xpath_namespace(
        tree,
        "/Response/HotelResponse/"
        "OTA_HotelDescriptiveInfoRS/"
        "HotelDescriptiveContents/"
        "HotelDescriptiveContent")

    for content_elem in content_elems:
        # Both names are bound before the try block because the except
        # handler below reads them.
        item_hotel_code = None
        item_hotel_city_code = task.kwargs.get('citycode')
        try:
            item_hotel_code = content_elem.attrib.get('HotelCode')
            item_hotel_name = content_elem.attrib.get('HotelName')
            item_hotel_brand_id = content_elem.attrib.get('BrandCode')

            # --- position ---
            position_elem = flist(
                xpath_namespace(content_elem, "HotelInfo//Position"),
                None)
            item_hotel_latitude = ""
            item_hotel_longitude = ""
            if position_elem is not None:
                item_hotel_latitude = \
                    position_elem.attrib.get('Latitude', "")
                item_hotel_longitude = \
                    position_elem.attrib.get('Longitude', "")

            # --- services, split into hotel-level and room-level ---
            service_elems = xpath_namespace(
                content_elem, "HotelInfo/Services/Service")
            item_hotel_service = u"、".join([
                flist(service.xpath(
                    "*[local-name()='DescriptiveText']/text()"))
                for service in service_elems
                if "Code" in service.attrib and
                service.attrib["Code"] in HOTEL_SERVICE_CODES
            ])
            item_room_service = u"、".join([
                flist(service.xpath(
                    "*[local-name()='DescriptiveText']/text()"))
                for service in service_elems
                if "Code" in service.attrib and
                service.attrib["Code"] in ROOM_SERVICE_CODES
            ])

            # --- star rating / rate ---
            awards_elem = flist(
                xpath_namespace(content_elem, "AffiliationInfo/Awards"),
                None)
            item_hotel_star = ""
            item_hotel_rate = ""
            if awards_elem is not None:
                item_hotel_star = flist(awards_elem.xpath(
                    "*[local-name()='Award' and "
                    "@Provider='HotelStarRate']/@Rating"))
                item_hotel_rate = flist(awards_elem.xpath(
                    "*[local-name()='Award' and "
                    "@Provider='CtripStarRate']/@Rating"))

            # --- images ---
            multimedia_elem = flist(
                xpath_namespace(content_elem, "MultimediaDescriptions"),
                None)
            image_elems = []
            if multimedia_elem is not None:
                image_elems = xpath_namespace(
                    multimedia_elem,
                    "MultimediaDescription/ImageItems/ImageItem")

            item_image_list = []
            for image_elem in image_elems:
                image_url = flist(image_elem.xpath(
                    "*[local-name()='ImageFormat']/"
                    "*[local-name()='URL']/text()"))
                image_type = flist(image_elem.xpath("@Category"))
                if not image_url and not image_type:
                    continue
                image_text = flist(image_elem.xpath(
                    "*[local-name()='Description']/@Caption"))
                item_image_list.append({
                    "image_url": image_url,
                    "image_type": image_type,
                    "image_text": image_text.encode('utf-8'),
                })
                if item_hotel_code and image_url:
                    yield ImageItem(item_hotel_code,
                                    str(image_type),
                                    unicode(image_text),
                                    str(image_url))

            # --- preview text ---
            # Only query TextItems when a MultimediaDescriptions element
            # exists, mirroring the guard on image_elems above. Passing
            # None to xpath_namespace would presumably fail and send a
            # hotel that simply has no multimedia down the retry path.
            text_items_elem = None
            if multimedia_elem is not None:
                text_items_elem = flist(
                    xpath_namespace(multimedia_elem,
                                    "MultimediaDescription/TextItems"),
                    None)
            item_hotel_preview = ""
            if text_items_elem is not None:
                item_hotel_preview = flist(text_items_elem.xpath(
                    "*[local-name()='TextItem' and @Category='5']/"
                    "*[local-name()='Description']/text()"))

            # --- rooms ---
            room_elems = xpath_namespace(
                content_elem, "FacilityInfo/GuestRooms/GuestRoom")
            item_room_list = []
            for room_elem in room_elems:
                room_info_id = flist(room_elem.xpath(
                    "*[local-name()='TypeRoom']/@RoomTypeCode"))
                room_info_name = flist(room_elem.xpath("@RoomTypeName"))
                room_bed_type = flist(room_elem.xpath(
                    "*[local-name()='TypeRoom']/@BedTypeCode"))
                room_net_service, room_net_service_fee = \
                    _extract_net_service(room_elem)
                room_floor = flist(room_elem.xpath(
                    "*[local-name()='TypeRoom']/@Floor"))
                # These fields are not extracted from this response and
                # are always emitted as empty strings.
                room_info_rate_price = ""
                room_hot = ""
                room_breakfast = ""
                room_area = ""

                if not (room_info_id and room_info_name and
                        item_hotel_code):
                    continue

                item_room_list.append({
                    'roomInfo_id': room_info_id,
                    'roomInfo_ratePrice': room_info_rate_price,
                    'hot': room_hot,
                })
                yield RoomInfoItem(item_hotel_code,
                                   str(room_info_id),
                                   room_info_name,
                                   room_floor,
                                   room_net_service,
                                   room_net_service_fee,
                                   room_bed_type,
                                   room_breakfast,
                                   room_area)

            # --- the hotel itself ---
            # A task without an 'address' kwarg yields an empty address
            # instead of raising on the membership test.
            item_hotel_address = ""
            if hotel_address_dict:
                item_hotel_address = hotel_address_dict.get(
                    item_hotel_code, "")

            yield HotelInfoItem(item_hotel_code,
                                item_hotel_city_code,
                                item_hotel_name,
                                item_hotel_brand_id,
                                item_hotel_latitude,
                                item_hotel_longitude,
                                item_hotel_service,
                                item_room_service,
                                item_hotel_star,
                                item_hotel_rate,
                                item_image_list,
                                item_hotel_preview,
                                item_room_list,
                                item_hotel_address)
        except Exception as e:
            # Best effort per hotel: log the failure and, when the hotel
            # code is known, queue a new task for that hotel alone so the
            # remaining hotels in this response are still processed.
            self.logger.warn("one hotel extract error:%s" % e)
            if item_hotel_code is None:
                self.logger.error("i am sorry, i can do noting")
            else:
                chinese_name = task.kwargs.get('chinesename')
                yield build_rooms_task_for_hotel(
                    [item_hotel_code], item_hotel_city_code,
                    chinese_name, hotel_address_dict)
def parse(self, task, input_file):
    """Parse a hotel-descriptive-info SOAP response into items.

    Args:
        task: Task; ``task.kwargs`` is read for ``address`` (a mapping of
            hotel code to address), ``citycode`` and ``chinesename``.
        input_file: file-like object whose content is the SOAP response.

    Yields:
        item: an ImageItem per image that has a URL, a RoomInfoItem per
            guest room that has both a type code and a name, and one
            HotelInfoItem per ``HotelDescriptiveContent`` element.
        task: when extracting one hotel fails and its hotel code is
            known, the result of ``build_rooms_task_for_hotel`` for that
            single hotel.

    Raises:
        ParserError: if the outer envelope cannot be parsed as XML, or if
            the response header's ``ResultCode`` is absent or is not
            ``"Success"``.
    """
    self.logger.debug("room parser begin to parse")

    try:
        soap_tree = etree.fromstring(input_file.read())
    except Exception as e:
        self.logger.error("not complete xml:%s" % e)
        raise ParserError("not complete xml")

    hotel_address_dict = task.kwargs.get('address')

    # The payload is a second XML document carried as the text of the
    # RequestResult element, so it is parsed again here.
    soap_elems = xpath_namespace(
        soap_tree,
        "/soap:Envelope/soap:Body/"
        "RequestResponse/RequestResult")
    tree = etree.fromstring(soap_elems[0].text)

    header = tree.xpath("/Response/Header")[0]
    if header.attrib.get('ResultCode') != "Success":
        self.logger.error("not has resultcode or "
                          "resultcode is not success")
        raise ParserError("ResultCode error")

    content_elems = xpath_namespace(
        tree,
        "/Response/HotelResponse/"
        "OTA_HotelDescriptiveInfoRS/"
        "HotelDescriptiveContents/"
        "HotelDescriptiveContent")

    for content_elem in content_elems:
        # Both names are bound before the try block because the except
        # handler below reads them.
        item_hotel_code = None
        item_hotel_city_code = task.kwargs.get('citycode')
        try:
            item_hotel_code = content_elem.attrib.get('HotelCode')
            item_hotel_name = content_elem.attrib.get('HotelName')
            item_hotel_brand_id = content_elem.attrib.get('BrandCode')

            # --- position ---
            position_elem = flist(
                xpath_namespace(content_elem, "HotelInfo//Position"),
                None)
            item_hotel_latitude = ""
            item_hotel_longitude = ""
            if position_elem is not None:
                item_hotel_latitude = \
                    position_elem.attrib.get('Latitude', "")
                item_hotel_longitude = \
                    position_elem.attrib.get('Longitude', "")

            # --- services, split into hotel-level and room-level ---
            service_elems = xpath_namespace(
                content_elem, "HotelInfo/Services/Service")
            item_hotel_service = u"、".join([
                flist(service.xpath(
                    "*[local-name()='DescriptiveText']/text()"))
                for service in service_elems
                if "Code" in service.attrib and
                service.attrib["Code"] in HOTEL_SERVICE_CODES
            ])
            item_room_service = u"、".join([
                flist(service.xpath(
                    "*[local-name()='DescriptiveText']/text()"))
                for service in service_elems
                if "Code" in service.attrib and
                service.attrib["Code"] in ROOM_SERVICE_CODES
            ])

            # --- star rating / rate ---
            awards_elem = flist(
                xpath_namespace(content_elem, "AffiliationInfo/Awards"),
                None)
            item_hotel_star = ""
            item_hotel_rate = ""
            if awards_elem is not None:
                item_hotel_star = flist(awards_elem.xpath(
                    "*[local-name()='Award' and "
                    "@Provider='HotelStarRate']/@Rating"))
                item_hotel_rate = flist(awards_elem.xpath(
                    "*[local-name()='Award' and "
                    "@Provider='CtripStarRate']/@Rating"))

            # --- images ---
            multimedia_elem = flist(
                xpath_namespace(content_elem, "MultimediaDescriptions"),
                None)
            image_elems = []
            if multimedia_elem is not None:
                image_elems = xpath_namespace(
                    multimedia_elem,
                    "MultimediaDescription/ImageItems/ImageItem")

            item_image_list = []
            for image_elem in image_elems:
                image_url = flist(image_elem.xpath(
                    "*[local-name()='ImageFormat']/"
                    "*[local-name()='URL']/text()"))
                image_type = flist(image_elem.xpath("@Category"))
                if not image_url and not image_type:
                    continue
                image_text = flist(image_elem.xpath(
                    "*[local-name()='Description']/@Caption"))
                item_image_list.append({
                    "image_url": image_url,
                    "image_type": image_type,
                    "image_text": image_text.encode('utf-8'),
                })
                if item_hotel_code and image_url:
                    yield ImageItem(item_hotel_code,
                                    str(image_type),
                                    unicode(image_text),
                                    str(image_url))

            # --- preview text ---
            # Only query TextItems when a MultimediaDescriptions element
            # exists, mirroring the guard on image_elems above. Passing
            # None to xpath_namespace would presumably fail and send a
            # hotel that simply has no multimedia down the retry path.
            text_items_elem = None
            if multimedia_elem is not None:
                text_items_elem = flist(
                    xpath_namespace(multimedia_elem,
                                    "MultimediaDescription/TextItems"),
                    None)
            item_hotel_preview = ""
            if text_items_elem is not None:
                item_hotel_preview = flist(text_items_elem.xpath(
                    "*[local-name()='TextItem' and @Category='5']/"
                    "*[local-name()='Description']/text()"))

            # --- rooms ---
            room_elems = xpath_namespace(
                content_elem, "FacilityInfo/GuestRooms/GuestRoom")
            item_room_list = []
            for room_elem in room_elems:
                room_info_id = flist(room_elem.xpath(
                    "*[local-name()='TypeRoom']/@RoomTypeCode"))
                room_info_name = flist(room_elem.xpath("@RoomTypeName"))
                room_bed_type = flist(room_elem.xpath(
                    "*[local-name()='TypeRoom']/@BedTypeCode"))
                room_net_service, room_net_service_fee = \
                    _extract_net_service(room_elem)
                room_floor = flist(room_elem.xpath(
                    "*[local-name()='TypeRoom']/@Floor"))
                # These fields are not extracted from this response and
                # are always emitted as empty strings.
                room_info_rate_price = ""
                room_hot = ""
                room_breakfast = ""
                room_area = ""

                if not (room_info_id and room_info_name and
                        item_hotel_code):
                    continue

                item_room_list.append({
                    'roomInfo_id': room_info_id,
                    'roomInfo_ratePrice': room_info_rate_price,
                    'hot': room_hot,
                })
                yield RoomInfoItem(item_hotel_code,
                                   str(room_info_id),
                                   room_info_name,
                                   room_floor,
                                   room_net_service,
                                   room_net_service_fee,
                                   room_bed_type,
                                   room_breakfast,
                                   room_area)

            # --- the hotel itself ---
            # A task without an 'address' kwarg yields an empty address
            # instead of raising on the membership test.
            item_hotel_address = ""
            if hotel_address_dict:
                item_hotel_address = hotel_address_dict.get(
                    item_hotel_code, "")

            yield HotelInfoItem(item_hotel_code,
                                item_hotel_city_code,
                                item_hotel_name,
                                item_hotel_brand_id,
                                item_hotel_latitude,
                                item_hotel_longitude,
                                item_hotel_service,
                                item_room_service,
                                item_hotel_star,
                                item_hotel_rate,
                                item_image_list,
                                item_hotel_preview,
                                item_room_list,
                                item_hotel_address)
        except Exception as e:
            # Best effort per hotel: log the failure and, when the hotel
            # code is known, queue a new task for that hotel alone so the
            # remaining hotels in this response are still processed.
            self.logger.warn("one hotel extract error:%s" % e)
            if item_hotel_code is None:
                self.logger.error("i am sorry, i can do noting")
            else:
                chinese_name = task.kwargs.get('chinesename')
                yield build_rooms_task_for_hotel(
                    [item_hotel_code], item_hotel_city_code,
                    chinese_name, hotel_address_dict)