def parse(self, task, input_file):
    """Parse the city XML file and emit a task and an item for each valid city.

    Args:
        task: HttpTask or FileTask, the task being processed (not read here).
        input_file: file, response body or file, parsed as XML.

    Yields:
        task: Task, the value returned by ``build_hotels_task_for_city`` for
            a city whose name, city code and ctrip code are all non-empty.
        item: CityItem, built from the same city's name, ctrip code and
            city code, yielded right after the task.
    """
    self.logger.info("city parser begin to parse")
    try:
        tree = etree.parse(input_file)
        for elem in tree.xpath("//CityDetail"):
            chinese_name = remove_white(elem.findtext("CityName", ""))
            # The city code is looked up from the cleaned CityName, so reuse
            # the value instead of extracting and cleaning it a second time.
            city_code = get_city_code(chinese_name)
            ctrip_code = remove_white(elem.findtext("City", ""))

            # Skip (and log) any city that is missing one of the three fields.
            if not chinese_name or not city_code or not ctrip_code:
                self.logger.info("invaliade city chinese_name:%s "
                                 "citycode:%s ctrip_code:%s" %
                                 (chinese_name, city_code, ctrip_code))
                continue

            yield build_hotels_task_for_city(ctrip_code, city_code,
                                             chinese_name)
            yield CityItem(chinese_name, ctrip_code, city_code)
    except Exception as e:
        # Any failure while parsing or building results is logged and ends
        # the generator; nothing is re-raised to the caller.
        self.logger.error("city parser extract error:%s" % e)
def parse(self, task, input_file):
    """Parse the city XML file and emit a task and an item for each valid city.

    Args:
        task: HttpTask or FileTask, the task being processed (not read here).
        input_file: file, response body or file, parsed as XML.

    Yields:
        task: Task, the value returned by ``build_hotels_task_for_city`` for
            a city whose name, city code and ctrip code are all non-empty.
        item: CityItem, built from the same city's name, ctrip code and
            city code, yielded right after the task.
    """
    self.logger.info("city parser begin to parse")
    try:
        tree = etree.parse(input_file)
        for elem in tree.xpath("//CityDetail"):
            chinese_name = remove_white(elem.findtext("CityName", ""))
            # The city code is looked up from the cleaned CityName, so reuse
            # the value instead of extracting and cleaning it a second time.
            city_code = get_city_code(chinese_name)
            ctrip_code = remove_white(elem.findtext("City", ""))

            # Skip (and log) any city that is missing one of the three fields.
            if not chinese_name or not city_code or not ctrip_code:
                self.logger.info("invaliade city chinese_name:%s "
                                 "citycode:%s ctrip_code:%s" %
                                 (chinese_name, city_code, ctrip_code))
                continue

            yield build_hotels_task_for_city(ctrip_code, city_code,
                                             chinese_name)
            yield CityItem(chinese_name, ctrip_code, city_code)
    except Exception as e:
        # Any failure while parsing or building results is logged and ends
        # the generator; nothing is re-raised to the caller.
        self.logger.error("city parser extract error:%s" % e)
def parse(self, task, input_file):
    """Parse the city list page and emit an item and a task per usable city.

    Args:
        task: Task, task description (not read here).
        input_file: File, file object parsed as HTML.

    Yields:
        CityItem for every comma-separated city name that is non-empty after
        cleaning and has a city code, immediately followed by an HttpTask
        (callback ``DealParser``) requesting that city's URL.
    """
    document = html.parse(input_file)
    matches = document.xpath("//p[@id='citypid']/text()")

    # Only the first matched text node is used; no match means no cities.
    if matches is not None and len(matches) > 0:
        joined_names = matches[0]
    else:
        joined_names = ""

    for raw_name in joined_names.split(u","):
        english_name = remove_white(raw_name)
        if len(english_name) == 0:
            continue

        item = CityItem("", english_name, get_city_code(english_name))
        if not (item.english_name and item.city_code):
            continue

        yield item

        request = HTTPRequest(
            url=build_url_by_city_name(item.english_name),
            connect_timeout=20,
            request_timeout=240)
        yield HttpTask(request,
                       callback='DealParser',
                       max_fail_count=5,
                       kwargs={'citycode': item.city_code})
def parse(self, task, input_file):
    """Parse the city list page and emit an item and a task per usable city.

    Args:
        task: Task, task description (not read here).
        input_file: File, file object parsed as HTML.

    Yields:
        CityItem for every comma-separated city name that is non-empty after
        cleaning and has a city code, immediately followed by an HttpTask
        (callback ``DealParser``) requesting that city's URL.
    """
    document = html.parse(input_file)
    matches = document.xpath("//p[@id='citypid']/text()")

    # Only the first matched text node is used; no match means no cities.
    if matches is not None and len(matches) > 0:
        joined_names = matches[0]
    else:
        joined_names = ""

    for raw_name in joined_names.split(u","):
        english_name = remove_white(raw_name)
        if len(english_name) == 0:
            continue

        item = CityItem("", english_name, get_city_code(english_name))
        if not (item.english_name and item.city_code):
            continue

        yield item

        request = HTTPRequest(
            url=build_url_by_city_name(item.english_name),
            connect_timeout=20,
            request_timeout=240)
        yield HttpTask(request,
                       callback='DealParser',
                       max_fail_count=5,
                       kwargs={'citycode': item.city_code})
def _extract_content_text(data_element):
    """Extract the ``content_text`` value of a node as dash-prefixed text.

    The raw value has every "-" removed and its ``<br />`` / ``<br/>`` tags
    replaced by the marker ``N_Line``. It is then parsed as HTML, the
    non-empty cleaned text fragments are concatenated behind a leading "-",
    and every marker (with surrounding whitespace) becomes ``N_Line-``.
    A marker that ends the last fragment is dropped first.

    Args:
        data_element: Element, node whose ``content_text`` child is read.

    Returns:
        content_text: str, the processed text, or "" when there is no text.
    """
    raw_text = data_element.findtext("content_text", "")
    # NOTE(review): the second replacement's arguments render as plain spaces
    # here; the original may have targeted a non-breaking space or &nbsp;.
    # Confirm against the file's history before relying on it.
    marked_text = (raw_text.replace("-", "")
                   .replace(" ", " ")
                   .replace("<br />", "N_Line")
                   .replace("<br/>", "N_Line"))

    # Missing or blank content: return early rather than handing an empty
    # document to the HTML parser and then calling itertext() on its result.
    if not marked_text.strip():
        return ""
    tree = etree.HTML(marked_text)
    if tree is None:
        return ""

    fragments = []
    for text in tree.itertext():
        cleaned = remove_white(text)
        if len(cleaned) > 0:
            fragments.append(cleaned)
    if not fragments:
        return ""

    # Drop a marker that ends the last fragment, unless the fragment is
    # nothing but the marker itself.
    last_fragment = fragments.pop().strip()
    if last_fragment.endswith("N_Line") and len(last_fragment) > 6:
        last_fragment = last_fragment[:-6]
    fragments.append(last_fragment)

    complete_text = "".join([u"-"] + fragments)
    # Raw string: same pattern as before without the invalid "\s" escape.
    return re.sub(r"\s*N_Line\s*", "N_Line-", complete_text)
def _extract_place(data_element):
    """Build the list of place records found under a node.

    Args:
        data_element: Element, node queried with the XPath ``places//place``.

    Returns:
        places: List, one dict per matched ``place`` node holding the cleaned
            text of its ``place_name``, ``address``, ``place_phone``,
            ``longitude``, ``latitude`` and ``open_time`` children ("" when
            a child is absent, before cleaning).
    """
    field_names = (
        'place_name',
        'address',
        'place_phone',
        'longitude',
        'latitude',
        'open_time',
    )
    return [
        {name: remove_white(node.findtext(name, "")) for name in field_names}
        for node in data_element.xpath("places//place")
    ]