コード例 #1
0
ファイル: parser.py プロジェクト: Yappawu/tigerspider
    def parse(self, task, input_file):
        """Parse the city XML file, emitting one task and one item per city.

        Every ``CityDetail`` element is read for its ``CityName`` and
        ``City`` child texts. Entries with an empty name, an empty ctrip
        code, or no resolvable city code are logged and skipped.

        Args:
            task: HttpTask or FileTask, the task that produced the input
                (not read by this method).
            input_file: file, response body or file

        Yields:
            task: Task, the hotels task built for a valid city
            item: CityItem, the parsed city (yielded right after its task)
        """
        self.logger.info("city parser begin to parse")
        try:
            tree = etree.parse(input_file)

            for elem in tree.xpath("//CityDetail"):
                chinese_name = remove_white(elem.findtext("CityName", ""))
                # The city code is looked up from the same cleaned name, so
                # reuse it rather than reading and cleaning the node twice.
                city_code = get_city_code(chinese_name)
                ctrip_code = remove_white(elem.findtext("City", ""))

                if not (chinese_name and city_code and ctrip_code):
                    self.logger.info("invaliade city chinese_name:%s "
                                     "citycode:%s ctrip_code:%s" %
                                     (chinese_name, city_code, ctrip_code))
                    continue

                yield build_hotels_task_for_city(ctrip_code, city_code,
                                                 chinese_name)
                yield CityItem(chinese_name, ctrip_code, city_code)

        except Exception as e:
            # Best-effort parser: a failure is logged and ends the
            # generator instead of propagating to the caller.
            self.logger.error("city parser extract error:%s" % e)
コード例 #2
0
ファイル: parser.py プロジェクト: JobsDong/tigerspider
    def parse(self, task, input_file):
        """Parse the city XML file, emitting one task and one item per city.

        Each ``CityDetail`` element contributes its ``CityName`` and
        ``City`` child texts. A city is skipped (and logged) when its name
        or ctrip code is empty, or when no city code is found for the name.

        Args:
            task: HttpTask or FileTask, the task that produced the input
                (not read by this method).
            input_file: file, response body or file

        Yields:
            task: Task, the hotels task built for a valid city
            item: CityItem, the parsed city (yielded right after its task)
        """
        self.logger.info("city parser begin to parse")
        try:
            tree = etree.parse(input_file)

            for elem in tree.xpath("//CityDetail"):
                chinese_name = remove_white(elem.findtext("CityName", ""))
                # Same cleaned name feeds the code lookup; no need to read
                # and clean the CityName node a second time.
                city_code = get_city_code(chinese_name)
                ctrip_code = remove_white(elem.findtext("City", ""))

                if not (chinese_name and city_code and ctrip_code):
                    self.logger.info("invaliade city chinese_name:%s "
                                     "citycode:%s ctrip_code:%s"
                                     % (chinese_name, city_code, ctrip_code))
                    continue

                yield build_hotels_task_for_city(ctrip_code, city_code,
                                                 chinese_name)
                yield CityItem(chinese_name, ctrip_code, city_code)

        except Exception as e:
            # Best-effort parser: a failure is logged and ends the
            # generator instead of propagating to the caller.
            self.logger.error("city parser extract error:%s" % e)
コード例 #3
0
 def parse(self, task, input_file):
     """Parse the city list page and emit an item plus a task per city.

     The first text node of the ``<p id="citypid">`` element is treated
     as a comma-separated list of city names.

     Args:
         task: Task, task description
         input_file: File, file object
     Yields:
         CityItem for every city that has both a name and a city code,
         followed by the HttpTask (callback 'DealParser') for that city.
     """
     document = html.parse(input_file)
     matches = document.xpath("//p[@id='citypid']/text()")
     if matches is not None and len(matches) > 0:
         joined_names = matches[0]
     else:
         joined_names = ""

     for raw_name in joined_names.split(u","):
         english_name = remove_white(raw_name)
         if len(english_name) <= 0:
             continue

         city_item = CityItem("", english_name,
                              get_city_code(english_name))
         if not (city_item.english_name and city_item.city_code):
             continue

         yield city_item

         # The request is only built once the consumer resumes the
         # generator after receiving the item.
         deal_url = build_url_by_city_name(city_item.english_name)
         http_request = HTTPRequest(url=deal_url,
                                    connect_timeout=20,
                                    request_timeout=240)
         yield HttpTask(http_request,
                        callback='DealParser',
                        max_fail_count=5,
                        kwargs={'citycode': city_item.city_code})
コード例 #4
0
ファイル: parser.py プロジェクト: JobsDong/tigerspider
 def parse(self, task, input_file):
     """Parse the city list page and emit an item plus a task per city.

     City names are read as a comma-separated list from the first text
     node of the ``<p id="citypid">`` element.

     Args:
         task: Task, task description
         input_file: File, file object
     Yields:
         CityItem for every city that has both a name and a city code,
         followed by the HttpTask (callback 'DealParser') for that city.
     """
     page = html.parse(input_file)
     text_nodes = page.xpath("//p[@id='citypid']/text()")
     has_text = text_nodes is not None and len(text_nodes) > 0
     name_list = text_nodes[0] if has_text else ""

     for name_chunk in name_list.split(u","):
         cleaned_name = remove_white(name_chunk)
         if len(cleaned_name) <= 0:
             continue

         city_item = CityItem("", cleaned_name, get_city_code(cleaned_name))
         if not (city_item.english_name and city_item.city_code):
             continue

         yield city_item

         # Built after the item has been handed out, as a separate step.
         request_url = build_url_by_city_name(city_item.english_name)
         http_request = HTTPRequest(url=request_url, connect_timeout=20,
                                    request_timeout=240)
         task_kwargs = {'citycode': city_item.city_code}
         yield HttpTask(http_request, callback='DealParser',
                        max_fail_count=5, kwargs=task_kwargs)
コード例 #5
0
def _extract_content_text(data_element):
    """Extract and normalize the ``content_text`` value of a node.

    Literal dashes are removed, ``&nbsp;`` becomes a space and ``<br>``
    tags become the ``N_Line`` marker. The remaining markup is parsed as
    HTML and its non-empty text fragments are concatenated behind a
    leading "-". A trailing ``N_Line`` on the last fragment is dropped,
    and every remaining ``N_Line`` (with surrounding whitespace) is
    rewritten to ``N_Line-``.

    Args:
        data_element: Element, node holding a ``content_text`` child
    Returns:
        content_text: str, the processed text; "" when the node has no
            usable content
    """
    content_text = data_element.findtext("content_text", "")
    content_text = (content_text.replace("-", "")
                    .replace("&nbsp;", " ")
                    .replace("<br />", "N_Line")
                    .replace("<br/>", "N_Line"))

    # A missing or blank content_text gives the HTML parser nothing to
    # build a tree from, so return the empty result before parsing.
    if not content_text.strip():
        return ""

    tree = etree.HTML(content_text)
    # NOTE(review): lxml is understood to return None when no element can
    # be built from the markup; guard before iterating - confirm.
    if tree is None:
        return ""

    temp_texts = []
    for text in tree.itertext():
        stripped_text = remove_white(text)
        if len(stripped_text) > 0:
            temp_texts.append(stripped_text)

    if not temp_texts:
        return ""

    # Drop a trailing line marker from the last fragment, unless the
    # fragment is nothing but the marker itself.
    last_text = temp_texts[-1].strip()
    if last_text.endswith("N_Line") and len(last_text) > 6:
        last_text = last_text[:-6]
    temp_texts[-1] = last_text

    complete_texts = "".join([u"-"] + temp_texts)

    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    return re.sub(r"\s*N_Line\s*", "N_Line-", complete_texts)
コード例 #6
0
ファイル: parser.py プロジェクト: JobsDong/tigerspider
def _extract_content_text(data_element):
    """Extract and normalize the ``content_text`` value of a node.

    Literal dashes are removed, ``&nbsp;`` becomes a space and ``<br>``
    tags become the ``N_Line`` marker. The remaining markup is parsed as
    HTML and its non-empty text fragments are concatenated behind a
    leading "-". A trailing ``N_Line`` on the last fragment is dropped,
    and every remaining ``N_Line`` (with surrounding whitespace) is
    rewritten to ``N_Line-``.

    Args:
        data_element: Element, node holding a ``content_text`` child
    Returns:
        content_text: str, the processed text; "" when the node has no
            usable content
    """
    content_text = data_element.findtext("content_text", "")
    content_text = (content_text.replace("-", "")
                    .replace("&nbsp;", " ")
                    .replace("<br />", "N_Line")
                    .replace("<br/>", "N_Line"))

    # A missing or blank content_text gives the HTML parser nothing to
    # build a tree from, so return the empty result before parsing.
    if not content_text.strip():
        return ""

    tree = etree.HTML(content_text)
    # NOTE(review): lxml is understood to return None when no element can
    # be built from the markup; guard before iterating - confirm.
    if tree is None:
        return ""

    temp_texts = []
    for text in tree.itertext():
        stripped_text = remove_white(text)
        if len(stripped_text) > 0:
            temp_texts.append(stripped_text)

    if not temp_texts:
        return ""

    # Drop a trailing line marker from the last fragment, unless the
    # fragment is nothing but the marker itself.
    last_text = temp_texts[-1].strip()
    if last_text.endswith("N_Line") and len(last_text) > 6:
        last_text = last_text[:-6]
    temp_texts[-1] = last_text

    complete_texts = "".join([u"-"] + temp_texts)

    # Raw string: "\s" in a plain literal is an invalid escape sequence.
    return re.sub(r"\s*N_Line\s*", "N_Line-", complete_texts)
コード例 #7
0
def _extract_place(data_element):
    """Collect the place records found under ``places`` of an API node.

        Args:
            data_element: Element, node queried with "places//place"
        Returns:
            places: List, one dict per place node holding the
                place_name, address, place_phone, longitude, latitude
                and open_time child texts ("" when a child is missing),
                each passed through remove_white
    """
    field_names = ("place_name", "address", "place_phone",
                   "longitude", "latitude", "open_time")
    return [
        {field: remove_white(node.findtext(field, ""))
         for field in field_names}
        for node in data_element.xpath("places//place")
    ]
コード例 #8
0
ファイル: parser.py プロジェクト: JobsDong/tigerspider
def _extract_place(data_element):
    """Build the list of place dictionaries from an API node.

        Args:
            data_element: Element, node queried with "places//place"
        Returns:
            places: List, one dict per place node; each dict maps
                place_name, address, place_phone, longitude, latitude
                and open_time to the matching child text after
                remove_white ("" when the child is missing)
    """
    wanted_fields = ("place_name", "address", "place_phone",
                     "longitude", "latitude", "open_time")

    collected = []
    for place_node in data_element.xpath("places//place"):
        record = {}
        for field in wanted_fields:
            record[field] = remove_white(place_node.findtext(field, ""))
        collected.append(record)
    return collected