def _extract_comments(tree): """解析出评论 Args: tree: ElementTree Returns: comments: list """ comment_elems = tree.xpath("//div[@id='REVIEWS']/div[@class='reviewSelector']") comments = [] for comment_elem in comment_elems: try: # 第一类型网页 comment_user = flist(comment_elem.xpath( "div/div[@class='col1of2']" "//div[@class='username mo']/span/text()"), u"") comment_score = flist(comment_elem.xpath( "div/div[@class='col2of2 ']" "/div[@class='rating reviewItemInline']" "/span[@class='rate rate_s s50']/img/@content"), u"") comment_time = flist(comment_elem.xpath( "div/div[@class='col2of2 ']" "/div[@class='rating reviewItemInline']" "/span[@class='ratingDate']/text()"), u"") content_elems = comment_elem.xpath( "div/div[@class='col2of2 ']/div[@class='entry']/p") texts = [] for content_elem in content_elems: if content_elem.text is not None: texts.append(content_elem.text) comment_content = u"".join(texts).strip() # 第二类型网页 if len(comment_user.strip()) == 0: comment_user = flist(comment_elem.xpath( "div/div[@class='col1of2']" "//div[@class='username']/span/text()"), u"") if len(comment_time.strip()) == 0: comment_time = flist(comment_elem.xpath( "div/div[@class='col2of2']" "/span[@class='ratingDate']/text()"), u"") if len(comment_content.strip()) == 0: content_elems = comment_elem.xpath( "div/div[@class='col2of2']" "/div[@class='entry']/p") texts = [] for content_elem in content_elems: if content_elem.text is not None: texts.append(content_elem.text) comment_content = u"".join(texts).strip() except Exception, e: print "extract one comment failed error:%s" % e print traceback.format_exc() else: if len(unicode(comment_content)) != 0: comment_item = CommentItem(unicode(comment_user).strip(), unicode(comment_time).strip(), unicode(comment_score).strip(), unicode(comment_content).strip()) comments.append(comment_item)
def parse(self, task, input_file):
    """Parse an attraction detail page.

    Args:
        task: HttpTask, the crawl task (kwargs carry name/address/hot/rank)
        input_file: StringIO, the fetched page

    Yields:
        AttractionItem, then either a description HttpTask or an empty
        DescriptionItem when the page has no description link.
    """
    self.logger.debug("attraction parser start to parse")
    content = input_file.read()
    # Keep the raw content: _extract_play_spend_and_unit works on the raw
    # string while the xpath queries work on the parsed tree.
    tree = html.parse(StringIO(content))
    try:
        zip_code = flist(tree.xpath("//span[@class='postal-code']/text()"),
                         u"")
        play_spend, play_spend_unit = _extract_play_spend_and_unit(content)
        tel_phone = flist(tree.xpath("//div[@id='HEADING_GROUP']"
                                     "/div[@class='wrap infoBox']"
                                     "/div[@class='odcHotel blDetails']"
                                     "/div/div[@class='fl']/text()"), u"")
        open_time = u""  # not available on this page layout
        total_score = flist(tree.xpath("//div[@class='rs rating']"
                                       "/span/img/@content"))
        ticket_info = u""  # not available on this page layout
        preview_relate_path = flist(tree.xpath(
            "//div[@class='listing_description']/a/@href"), u"")
        lon, lat = _extract_lon_lat(flist(tree.xpath(
            "//div[@class='js_mapThumb']"
            "/div[@id='bmapContainer']/img[1]/@src"), u""))
        comments = _extract_comments(tree)
        # Build the attraction item (description is fetched separately).
        attraction_item = AttractionItem(task.request.url,
                                         task.kwargs['name'],
                                         unicode(play_spend),
                                         play_spend_unit,
                                         task.kwargs['address'],
                                         unicode(tel_phone),
                                         unicode(open_time),
                                         unicode(total_score),
                                         unicode(ticket_info),
                                         task.kwargs['hot'], lon, lat,
                                         task.kwargs['rank'], comments,
                                         unicode(zip_code))
        yield attraction_item
        # Emit the description task, or an empty item when there is no link.
        if len(preview_relate_path) != 0:
            description_request = build_description_request(
                task.request.url, preview_relate_path)
            description_task = HttpTask(description_request,
                                        callback="DescriptionParser",
                                        max_fail_count=3,
                                        cookie_host=LVYOU_HOST,
                                        kwargs={'url': task.request.url})
            yield description_task
        else:
            yield DescriptionItem(task.request.url, u"")
    except Exception, e:
        print "error:%s" % e
        print traceback.format_exc()
def _extract_address(info_elem):
    """Build the full address (country + city + street) for a listing.

    Args:
        info_elem: Element, listing info node containing an <address> tag

    Returns:
        address: unicode, country name, city and street concatenated
    """
    parts = []
    for expr in ("address/span[@class='country-name']/text()",
                 "address/span[@class='locality']/text()"):
        parts.append(flist(info_elem.xpath(expr), u""))
    # The street is embedded in an inline <script>; decode it separately.
    script_text = flist(info_elem.xpath("address/span/script/text()"), u"")
    parts.append(_extract_script_text(script_text))
    return u"%s%s%s" % tuple(parts)
def _extract_address(info_elem):
    """Build the full address (country + city + street) for a listing.

    Args:
        info_elem: Element, listing info node containing an <address> tag

    Returns:
        address: unicode, country name, city and street concatenated
    """
    parts = []
    for expr in ("address/span[@class='country-name']/text()",
                 "address/span[@class='locality']/text()"):
        parts.append(flist(info_elem.xpath(expr), u""))
    # The street is embedded in an inline <script>; decode it separately.
    script_text = flist(info_elem.xpath("address/span/script/text()"), u"")
    parts.append(_extract_script_text(script_text))
    return u"%s%s%s" % tuple(parts)
def parse(self, task, input_file):
    """Parse a product detail page.

    Args:
        task: HttpTask, the crawl task (kwargs carry url/cookie info)
        input_file: file, the fetched page

    Yields:
        WebItem with the extracted detail data, then the picture task
        when a picture still has to be downloaded.
    """
    tree = html.parse(input_file)
    pic_url = unicode(flist(tree.xpath("//div[@class='product-price-left']"
                                       "/p/img/@src"), default=u""))
    desc_elems = tree.xpath("//div[@class='product-detail-alla-cont']")
    description = _extract_desc_elems(desc_elems)
    date_elems = tree.xpath("//ul[@class='productnew-header-pricea2-ul"
                            " clearfloat']/li/@d")
    telephone = flist(tree.xpath("//div[@class='top-w']//li[@class='tel']"
                                 "/span/text()"), default=u"")
    telephone = telephone.replace(u"-", u"")
    if len(telephone) == 0:
        # Fall back to the site's generic service hotline.
        telephone = u"4006228228"
    price_elems = tree.xpath("//ul[@class='productnew-header-pricec2-ul "
                             "productnew-"
                             "header-pricec3-ul productnew-header-"
                             "pricec2-cq']/li/@title")
    price_infos = list()
    for price_elem in price_elems:
        # De-duplicate while preserving first-seen order.
        if unicode(price_elem) not in price_infos:
            price_infos.append(unicode(price_elem))
    price_info = u"/".join(price_infos)
    time_infos = []
    for date_elem in date_elems:
        time_infos.append(date_elem)
    time_info = u";".join(time_infos)
    url = task.kwargs.get('url')
    cookie_host = task.kwargs.get('cookie_host')
    cookie_count = task.kwargs.get('cookie_count')
    pictures, pic_task = self._check_and_execute_picture(
        pic_url, cookie_host, cookie_count)
    # Emit the detail item.
    yield WebItem(url, telephone, description, pictures, time_info,
                  price_info, u"")
    # Emit the picture task, if any.
    if pic_task is not None:
        yield pic_task
def _extract_play_spend(tree):
    """Extract the recommended visit duration and its unit.

    Args:
        tree: ElementTree of the parsed page

    Returns:
        (play_spend, play_spend_unit): both unicode; the duration string is
        normalised (Chinese numerals to digits, ranges to "-", etc.)
    """
    play_spend = flist(tree.xpath(
        "//div[@id='J-aside-info-recommend_visit_time']"
        "/span[@class='val recommend_visit_time-value']/text()"), u"")
    # Pick the unit from whichever time word appears in the raw text.
    if play_spend.find(u"天") != -1:
        play_spend_unit = u"天"
    elif play_spend.find(u"小时") != -1:
        play_spend_unit = u"小时"
    elif play_spend.find(u"分") != -1 or play_spend.find(u"分钟") != -1:
        play_spend_unit = u"分"
    else:
        # Covers both the empty string and any unrecognised unit.
        play_spend_unit = u""
    # Strip unit words and map Chinese numerals/range markers to ASCII.
    replacements = ((u"小时", u""), (u"天", u""), (u"时", u""), (u"小", u""),
                    (u"分钟", u""), (u"分", u""), (u"钟", u""), (u"至", u"-"),
                    (u"超过", u">"), (u"半", u".5"), (u"一", u"1"),
                    (u"二", u"2"), (u"三", u"3"), (u"四", u"4"), (u"五", u"5"),
                    (u"六", u"6"), (u"七", u"7"), (u"八", u"8"), (u"九", u"9"))
    for old, new in replacements:
        play_spend = play_spend.replace(old, new)
    # A bare "half" (半) becomes ".5"; present it as "0.5".
    if play_spend.strip() == u".5":
        play_spend = u"0.5"
    return play_spend, play_spend_unit
def parse(self, task, input_file):
    """Parse a product detail page.

    Args:
        task: HttpTask, the crawl task (kwargs carry url/cookie info)
        input_file: file, the fetched page

    Yields:
        WebItem with the extracted detail data, then the picture task
        when a picture still has to be downloaded.
    """
    tree = html.parse(input_file)
    pic_url = unicode(flist(tree.xpath("//div[@class='product-price-left']"
                                       "/p/img/@src"), default=u""))
    desc_elems = tree.xpath("//div[@class='product-detail-alla-cont']")
    description = _extract_desc_elems(desc_elems)
    date_elems = tree.xpath("//ul[@class='productnew-header-pricea2-ul"
                            " clearfloat']/li/@d")
    telephone = flist(tree.xpath("//div[@class='top-w']//li[@class='tel']"
                                 "/span/text()"), default=u"")
    telephone = telephone.replace(u"-", u"")
    if len(telephone) == 0:
        # Fall back to the site's generic service hotline.
        telephone = u"4006228228"
    price_elems = tree.xpath("//ul[@class='productnew-header-pricec2-ul "
                             "productnew-"
                             "header-pricec3-ul productnew-header-"
                             "pricec2-cq']/li/@title")
    price_infos = list()
    for price_elem in price_elems:
        # De-duplicate while preserving first-seen order.
        if unicode(price_elem) not in price_infos:
            price_infos.append(unicode(price_elem))
    price_info = u"/".join(price_infos)
    time_infos = []
    for date_elem in date_elems:
        time_infos.append(date_elem)
    time_info = u";".join(time_infos)
    url = task.kwargs.get('url')
    cookie_host = task.kwargs.get('cookie_host')
    cookie_count = task.kwargs.get('cookie_count')
    pictures, pic_task = self._check_and_execute_picture(pic_url,
                                                         cookie_host,
                                                         cookie_count)
    # Emit the detail item.
    yield WebItem(url, telephone, description, pictures, time_info,
                  price_info, u"")
    # Emit the picture task, if any.
    if pic_task is not None:
        yield pic_task
def parse(self, task, input_file):
    """Parse a product detail page (variant without picture handling).

    Args:
        task: HttpTask, the crawl task
        input_file: file, the fetched page

    Yields:
        WebItem with the extracted detail data.
    """
    tree = html.parse(input_file)
    # NOTE(review): the predicate says @clas, not @class — possibly a typo,
    # but it may match the real page markup; confirm before "fixing" it.
    name = flist(tree.xpath(u"//div["
                            u"@clas='product-price-titleul']/h1/text()"))
    desc_elems = tree.xpath(u"//div[@class='product-detail-alla-cont']")
    description = _extract_desc_elems(desc_elems)
    date_elems = tree.xpath(
        u"//ul[@class='productnew-header-pricea2-ul clearfloat']/li/@d")
    telephone = flist(tree.xpath(
        u"//div[@class='top-w']//li[@class='tel']/span/text()"))
    telephone = telephone.replace(u"-", u"")
    if len(telephone) == 0:
        # Fall back to the site's generic service hotline.
        telephone = u"4006228228"
    price_elems = tree.xpath(
        u"//ul[@class='productnew-header-pricec2-ul productnew-"
        u"header-pricec3-ul productnew-header-pricec2-cq']/li/@title")
    price_infos = list()
    for price_elem in price_elems:
        # De-duplicate while preserving first-seen order.
        if unicode(price_elem) not in price_infos:
            price_infos.append(unicode(price_elem))
    price_info = u"/".join(price_infos)
    time_infos = []
    for date_elem in date_elems:
        time_infos.append(date_elem)
    time_info = u";".join(time_infos)
    url = task.request.url
    # Emit the detail item.
    yield WebItem(url, telephone, description, time_info, price_info, name)
def _extract_tips(tree):
    """Extract the travel-tips text from the page.

    Args:
        tree: ElementTree of the parsed page

    Returns:
        tips: unicode, all text under the attention article, u"" if absent
    """
    article = flist(tree.xpath("//div[@id='mod-attention']/"
                               "article[@class='content-article']"), None)
    if article is None:
        return u""
    # Concatenate every text fragment below the article node.
    return u"".join(article.itertext())
def _extract_tips(tree):
    """Extract the travel-tips text from the page.

    Args:
        tree: ElementTree of the parsed page

    Returns:
        tips: unicode, all text under the attention article, u"" if absent
    """
    article = flist(tree.xpath("//div[@id='mod-attention']/"
                               "article[@class='content-article']"), None)
    if article is None:
        return u""
    # Concatenate every text fragment below the article node.
    return u"".join(article.itertext())
def _extract_play_spend(tree):
    """Extract the recommended visit duration and its unit.

    Args:
        tree: ElementTree of the parsed page

    Returns:
        (play_spend, play_spend_unit): both unicode; the duration string is
        normalised (Chinese numerals to digits, ranges to "-", etc.)
    """
    play_spend = flist(tree.xpath(
        "//div[@id='J-aside-info-recommend_visit_time']"
        "/span[@class='val recommend_visit_time-value']/text()"), u"")
    # Pick the unit from whichever time word appears in the raw text.
    if play_spend.find(u"天") != -1:
        play_spend_unit = u"天"
    elif play_spend.find(u"小时") != -1:
        play_spend_unit = u"小时"
    elif play_spend.find(u"分") != -1 or play_spend.find(u"分钟") != -1:
        play_spend_unit = u"分"
    else:
        # Covers both the empty string and any unrecognised unit.
        play_spend_unit = u""
    # Strip unit words and map Chinese numerals/range markers to ASCII.
    replacements = ((u"小时", u""), (u"天", u""), (u"时", u""), (u"小", u""),
                    (u"分钟", u""), (u"分", u""), (u"钟", u""), (u"至", u"-"),
                    (u"超过", u">"), (u"半", u".5"), (u"一", u"1"),
                    (u"二", u"2"), (u"三", u"3"), (u"四", u"4"), (u"五", u"5"),
                    (u"六", u"6"), (u"七", u"7"), (u"八", u"8"), (u"九", u"9"))
    for old, new in replacements:
        play_spend = play_spend.replace(old, new)
    # A bare "half" (半) becomes ".5"; present it as "0.5".
    if play_spend.strip() == u".5":
        play_spend = u"0.5"
    return play_spend, play_spend_unit
def _extract_preview(tree):
    """Extract the preview/abstract text (handles both page layouts).

    Args:
        tree: ElementTree of the parsed page

    Returns:
        preview: the preview text, empty string when absent
    """
    # Page layout 1: paragraphs under the sidebar article.
    preview_elems = tree.xpath("//div[@class='sidebar-mod-inner']/"
                               "article/div[@class='']/p")
    preview_list = []
    for preview_elem in preview_elems:
        # BUGFIX: empty <p> nodes have .text None, which made "".join raise
        # TypeError; guard like the other extractors in this file do.
        if preview_elem.text is not None:
            preview_list.append(preview_elem.text)
    preview = "".join(preview_list)
    # Page layout 2: the abstract holder div.
    if len(preview) == 0:
        preview = flist(tree.xpath("//div[@class='view-mod-desc-main']"
                                   "/div[@id='view-mod-abstract']/"
                                   "div[@class='desc-all-holder']"
                                   "/text()"), u"")
    return preview
def _extract_preview(tree):
    """Extract the preview/abstract text (handles both page layouts).

    Args:
        tree: ElementTree of the parsed page

    Returns:
        preview: the preview text, empty string when absent
    """
    # Page layout 1: paragraphs under the sidebar article.
    preview_elems = tree.xpath("//div[@class='sidebar-mod-inner']/"
                               "article/div[@class='']/p")
    preview_list = []
    for preview_elem in preview_elems:
        # BUGFIX: empty <p> nodes have .text None, which made "".join raise
        # TypeError; guard like the other extractors in this file do.
        if preview_elem.text is not None:
            preview_list.append(preview_elem.text)
    preview = "".join(preview_list)
    # Page layout 2: the abstract holder div.
    if len(preview) == 0:
        preview = flist(tree.xpath("//div[@class='view-mod-desc-main']"
                                   "/div[@id='view-mod-abstract']/"
                                   "div[@class='desc-all-holder']"
                                   "/text()"), u"")
    return preview
def parse(self, task, input_file):
    """Parse an attraction list page.

    Args:
        task: HttpTask, the crawl task
        input_file: StringIO, the fetched page

    Yields:
        HttpTask, one AttractionParser task per listed attraction.
    """
    tree = html.parse(input_file)
    attraction_elems = tree.xpath(
        "//div[@id='ATTRACTION_OVERVIEW']"
        "/div[@class='attraction-list clearfix']")
    for attraction_elem in attraction_elems:
        try:
            info_elem = flist(
                attraction_elem.xpath(
                    "div[@class='clearfix']/div[@class='info']"), None)
            rank_elem = flist(
                attraction_elem.xpath(
                    "div[@class='clearfix']/div[@class='rank']"), None)
            relate_path = flist(
                info_elem.xpath("div[@class='title']/a/@href"), u"")
            name = flist(info_elem.xpath("div[@class='title']/a/text()"),
                         u"")
            address = _extract_address(info_elem)
            hot = flist(rank_elem.xpath("a/strong/text()"), u"")
            rank = flist(rank_elem.xpath("span[1]/strong/text()"), u"")
            # Build the per-attraction detail task.
            http_request = build_attraction_request(relate_path)
            attraction_task = HttpTask(http_request,
                                       callback="AttractionParser",
                                       max_fail_count=3,
                                       cookie_host=LVYOU_HOST,
                                       kwargs={
                                           "name": unicode(name).strip(),
                                           "address": unicode(address),
                                           "hot": unicode(hot),
                                           "rank": unicode(rank)
                                       })
            yield attraction_task
        except Exception, e:
            # One bad listing must not abort the rest of the page.
            self.logger.warn("extract one attraction failed error:%s" % e)
def parse(self, task, input_file):
    """Parse an attraction list page.

    Args:
        task: HttpTask, the crawl task
        input_file: StringIO, the fetched page

    Yields:
        HttpTask, one AttractionParser task per listed attraction.
    """
    tree = html.parse(input_file)
    attraction_elems = tree.xpath("//div[@id='ATTRACTION_OVERVIEW']"
                                  "/div[@class='attraction-list clearfix']")
    for attraction_elem in attraction_elems:
        try:
            info_elem = flist(attraction_elem.xpath(
                "div[@class='clearfix']/div[@class='info']"), None)
            rank_elem = flist(attraction_elem.xpath(
                "div[@class='clearfix']/div[@class='rank']"), None)
            relate_path = flist(info_elem.xpath(
                "div[@class='title']/a/@href"), u"")
            name = flist(info_elem.xpath(
                "div[@class='title']/a/text()"), u"")
            address = _extract_address(info_elem)
            hot = flist(rank_elem.xpath("a/strong/text()"), u"")
            rank = flist(rank_elem.xpath("span[1]/strong/text()"), u"")
            # Build the per-attraction detail task.
            http_request = build_attraction_request(relate_path)
            attraction_task = HttpTask(
                http_request, callback="AttractionParser", max_fail_count=3,
                cookie_host=LVYOU_HOST,
                kwargs={"name": unicode(name).strip(), "address": unicode(
                    address), "hot": unicode(hot), "rank": unicode(rank)})
            yield attraction_task
        except Exception, e:
            # One bad listing must not abort the rest of the page.
            self.logger.warn("extract one attraction failed error:%s" % e)
def parse(self, task, input_file):
    """Parse an attraction detail page.

    Args:
        task: HttpTask, the crawl task (kwargs carry name/address/hot/rank)
        input_file: StringIO, the fetched page

    Yields:
        AttractionItem, then either a description HttpTask or an empty
        DescriptionItem when the page has no description link.
    """
    self.logger.debug("attraction parser start to parse")
    content = input_file.read()
    # Keep the raw content: _extract_play_spend_and_unit works on the raw
    # string while the xpath queries work on the parsed tree.
    tree = html.parse(StringIO(content))
    try:
        zip_code = flist(tree.xpath("//span[@class='postal-code']/text()"),
                         u"")
        play_spend, play_spend_unit = _extract_play_spend_and_unit(content)
        tel_phone = flist(
            tree.xpath("//div[@id='HEADING_GROUP']"
                       "/div[@class='wrap infoBox']"
                       "/div[@class='odcHotel blDetails']"
                       "/div/div[@class='fl']/text()"), u"")
        open_time = u""  # not available on this page layout
        total_score = flist(
            tree.xpath("//div[@class='rs rating']"
                       "/span/img/@content"))
        ticket_info = u""  # not available on this page layout
        preview_relate_path = flist(
            tree.xpath("//div[@class='listing_description']/a/@href"), u"")
        lon, lat = _extract_lon_lat(
            flist(
                tree.xpath("//div[@class='js_mapThumb']"
                           "/div[@id='bmapContainer']/img[1]/@src"), u""))
        comments = _extract_comments(tree)
        # Build the attraction item (description is fetched separately).
        attraction_item = AttractionItem(task.request.url,
                                         task.kwargs['name'],
                                         unicode(play_spend),
                                         play_spend_unit,
                                         task.kwargs['address'],
                                         unicode(tel_phone),
                                         unicode(open_time),
                                         unicode(total_score),
                                         unicode(ticket_info),
                                         task.kwargs['hot'], lon, lat,
                                         task.kwargs['rank'], comments,
                                         unicode(zip_code))
        yield attraction_item
        # Emit the description task, or an empty item when there is no link.
        if len(preview_relate_path) != 0:
            description_request = build_description_request(
                task.request.url, preview_relate_path)
            description_task = HttpTask(description_request,
                                        callback="DescriptionParser",
                                        max_fail_count=3,
                                        cookie_host=LVYOU_HOST,
                                        kwargs={'url': task.request.url})
            yield description_task
        else:
            yield DescriptionItem(task.request.url, u"")
    except Exception, e:
        print "error:%s" % e
        print traceback.format_exc()
class AttractionListParser(BaseParser):
    """Parser for attraction list pages: emits one detail task per listed
    attraction and a follow-up task for the next list page."""

    def __init__(self, namespace):
        BaseParser.__init__(self, namespace)
        self.logger.info("init attraction list parser finish")

    def parse(self, task, input_file):
        """Parse an attraction list page.

        Args:
            task: HttpTask, the crawl task
            input_file: StringIO, the fetched page

        Yields:
            HttpTask, one AttractionParser task per attraction, then an
            AttractionListParser task for the next page when present.
        """
        tree = html.parse(input_file)
        attraction_elems = tree.xpath(
            "//div[@id='ATTRACTION_OVERVIEW']"
            "/div[@class='attraction-list clearfix']")
        for attraction_elem in attraction_elems:
            try:
                info_elem = flist(
                    attraction_elem.xpath(
                        "div[@class='clearfix']/div[@class='info']"), None)
                rank_elem = flist(
                    attraction_elem.xpath(
                        "div[@class='clearfix']/div[@class='rank']"), None)
                relate_path = flist(
                    info_elem.xpath("div[@class='title']/a/@href"), u"")
                name = flist(info_elem.xpath("div[@class='title']/a/text()"),
                             u"")
                address = _extract_address(info_elem)
                hot = flist(rank_elem.xpath("a/strong/text()"), u"")
                rank = flist(rank_elem.xpath("span[1]/strong/text()"), u"")
                # Build the per-attraction detail task.
                http_request = build_attraction_request(relate_path)
                attraction_task = HttpTask(http_request,
                                           callback="AttractionParser",
                                           max_fail_count=3,
                                           cookie_host=LVYOU_HOST,
                                           kwargs={
                                               "name": unicode(name).strip(),
                                               "address": unicode(address),
                                               "hot": unicode(hot),
                                               "rank": unicode(rank)
                                           })
                yield attraction_task
            except Exception, e:
                # One bad listing must not abort the rest of the page.
                self.logger.warn("extract one attraction failed error:%s" % e)
        # Build the next-page task when a "next" link exists.
        next_page_relate = flist(
            tree.xpath(
                "//div[@class='pagination']/div"
                "/a[@class='next sprite-arrow-right-green ml6 ']/@href"),
            u"")
        if len(next_page_relate) != 0:
            next_page_request = build_next_page_request(next_page_relate)
            next_page_task = HttpTask(next_page_request,
                                      callback="AttractionListParser",
                                      max_fail_count=5,
                                      cookie_host=LVYOU_HOST)
            yield next_page_task
def _extract_comments(tree): """解析出评论 Args: tree: ElementTree Returns: comments: list """ comment_elems = tree.xpath( "//div[@id='REVIEWS']/div[@class='reviewSelector']") comments = [] for comment_elem in comment_elems: try: # 第一类型网页 comment_user = flist( comment_elem.xpath("div/div[@class='col1of2']" "//div[@class='username mo']/span/text()"), u"") comment_score = flist( comment_elem.xpath( "div/div[@class='col2of2 ']" "/div[@class='rating reviewItemInline']" "/span[@class='rate rate_s s50']/img/@content"), u"") comment_time = flist( comment_elem.xpath("div/div[@class='col2of2 ']" "/div[@class='rating reviewItemInline']" "/span[@class='ratingDate']/text()"), u"") content_elems = comment_elem.xpath( "div/div[@class='col2of2 ']/div[@class='entry']/p") texts = [] for content_elem in content_elems: if content_elem.text is not None: texts.append(content_elem.text) comment_content = u"".join(texts).strip() # 第二类型网页 if len(comment_user.strip()) == 0: comment_user = flist( comment_elem.xpath("div/div[@class='col1of2']" "//div[@class='username']/span/text()"), u"") if len(comment_time.strip()) == 0: comment_time = flist( comment_elem.xpath("div/div[@class='col2of2']" "/span[@class='ratingDate']/text()"), u"") if len(comment_content.strip()) == 0: content_elems = comment_elem.xpath("div/div[@class='col2of2']" "/div[@class='entry']/p") texts = [] for content_elem in content_elems: if content_elem.text is not None: texts.append(content_elem.text) comment_content = u"".join(texts).strip() except Exception, e: print "extract one comment failed error:%s" % e print traceback.format_exc() else: if len(unicode(comment_content)) != 0: comment_item = CommentItem( unicode(comment_user).strip(), unicode(comment_time).strip(), unicode(comment_score).strip(), unicode(comment_content).strip()) comments.append(comment_item)
def parse(self, task, input_file): """parse method Args: task: Task, task input_file: file: file with content Yields: item: Item, result of parse task: Task, new task """ self.logger.debug("room parser begin to parse") try: try: soap_tree = etree.fromstring(input_file.read()) except Exception, e: self.logger.error("not complete xml:%s" % e) raise ParserError("not complete xml") hotel_address_dict = task.kwargs.get('address') soap_elems = xpath_namespace( soap_tree, "/soap:Envelope/soap:Body/" "RequestResponse/RequestResult") xml_str = soap_elems[0].text tree = etree.fromstring(xml_str) elems = tree.xpath("/Response/Header") header = elems[0] if "ResultCode" not in header.attrib or \ header.attrib['ResultCode'] != "Success": self.logger.error("not has resultcode or " "resultcode is not success") raise ParserError("ResultCode error") else: content_elems = xpath_namespace( tree, "/Response/HotelResponse/" "OTA_HotelDescriptiveInfoRS/" "HotelDescriptiveContents/" "HotelDescriptiveContent") for content_elem in content_elems: item_hotel_code = None item_hotel_city_code = task.kwargs.get('citycode') try: item_hotel_code = content_elem.attrib.get('HotelCode') item_hotel_name = content_elem.attrib.get('HotelName') item_hotel_brand_id = \ content_elem.attrib.get('BrandCode') position_elem = flist( xpath_namespace(content_elem, "HotelInfo//Position"), None) item_hotel_latitude = "" if position_elem is None \ or "Latitude" not in position_elem.attrib \ else position_elem.attrib.get('Latitude') item_hotel_longitude = "" if position_elem is None \ or "Longitude" not in position_elem.attrib \ else position_elem.attrib.get('Longitude') service_elems = xpath_namespace( content_elem, "HotelInfo/Services/Service") item_hotel_service = u"、".join([ flist( service.xpath( "*[local-name()='DescriptiveText']/text()") ) for service in service_elems if "Code" in service.attrib and service.attrib["Code"] in HOTEL_SERVICE_CODES ]) item_room_service = u"、".join([ flist( service.xpath( 
"*[local-name()='DescriptiveText']/text()") ) for service in service_elems if "Code" in service.attrib and service.attrib["Code"] in ROOM_SERVICE_CODES ]) awards_elem = flist( xpath_namespace(content_elem, "AffiliationInfo/Awards"), None) item_hotel_star, item_hotel_rate = ("", "") \ if awards_elem is None else \ (flist(awards_elem.xpath( "*[local-name()='Award' and " "@Provider='HotelStarRate']/@Rating")), flist(awards_elem.xpath( "*[local-name()='Award' and " "@Provider='CtripStarRate']/@Rating"))) multimedia_elem = flist( xpath_namespace(content_elem, "MultimediaDescriptions"), None) image_elems = [] if multimedia_elem is None \ else xpath_namespace( multimedia_elem, "MultimediaDescription/ImageItems/ImageItem") item_image_list = [] for image_elem in image_elems: image_url = flist( image_elem.xpath( "*[local-name()='ImageFormat']/" "*[local-name()='URL']/text()")) image_type = flist(image_elem.xpath("@Category")) if not image_url and not image_type: continue image_text = flist( image_elem.xpath( "*[local-name()='Description']/@Caption")) item_image_dict = { "image_url": image_url, "image_type": image_type, "image_text": image_text.encode('utf-8') } item_image_list.append(item_image_dict) if item_hotel_code and image_url: image_item = ImageItem(item_hotel_code, str(image_type), unicode(image_text), str(image_url)) yield image_item text_items_elem = flist( xpath_namespace(multimedia_elem, "MultimediaDescription/TextItems"), None) item_hotel_preview = "" if text_items_elem is None \ else flist(text_items_elem.xpath( "*[local-name()='TextItem' and @Category='5']/" "*[local-name()='Description']/text()")) room_elems = xpath_namespace( content_elem, "FacilityInfo/" "GuestRooms/GuestRoom") item_room_list = [] for room_elem in room_elems: room_info_id = flist( room_elem.xpath( "*[local-name()='TypeRoom']/@RoomTypeCode") ) room_info_name = flist( room_elem.xpath("@RoomTypeName")) room_bed_type = flist( room_elem.xpath( "*[local-name()='TypeRoom']/@BedTypeCode")) 
room_net_service, room_net_service_fee = \ _extract_net_service(room_elem) room_info_rate_price = "" room_hot = "" room_floor = flist( room_elem.xpath( "*[local-name()='TypeRoom']/@Floor")) room_breakfast = "" room_area = "" room_info_dict = { 'roomInfo_id': room_info_id, 'roomInfo_ratePrice': room_info_rate_price, 'hot': room_hot } if room_info_id and room_info_name and \ item_hotel_code: item_room_list.append(room_info_dict) room_item = RoomInfoItem( item_hotel_code, str(room_info_id), room_info_name, room_floor, room_net_service, room_net_service_fee, room_bed_type, room_breakfast, room_area) yield room_item item_hotel_address = "" if item_hotel_code not in \ hotel_address_dict \ else hotel_address_dict.get(item_hotel_code) hotel_item = HotelInfoItem( item_hotel_code, item_hotel_city_code, item_hotel_name, item_hotel_brand_id, item_hotel_latitude, item_hotel_longitude, item_hotel_service, item_room_service, item_hotel_star, item_hotel_rate, item_image_list, item_hotel_preview, item_room_list, item_hotel_address) yield hotel_item except Exception, e: self.logger.warn("one hotel extract error:%s" % e) if item_hotel_code is None: self.logger.error("i am sorry, i can do noting") else: chinese_name = task.kwargs.get('chinesename') yield build_rooms_task_for_hotel( [item_hotel_code], item_hotel_city_code, chinese_name, hotel_address_dict)
def parse(self, task, input_file): """parse response result Args: task: FileTask or HttpTask input_file: file or StringIO Yields: item: Item, result of parse task: Task, new task """ self.logger.info("hotel parser begin to parse") try: try: soap_tree = etree.fromstring(input_file.read()) except Exception, e: self.logger.error("not complete xml:%s" % e) raise ParserError("not complete xml") soap_elems = xpath_namespace(soap_tree, "/soap:Envelope/soap:Body/" "RequestResponse/RequestResult") xml_str = soap_elems[0].text tree = etree.fromstring(xml_str) elems = tree.xpath("/Response/Header") header = elems[0] if "ResultCode" not in header.attrib or \ header.attrib['ResultCode'] != "Success": self.logger.error("not has resultcode or " "resultcode is not success") raise ParserError("ResultCode error") else: # success property_elems = xpath_namespace( tree, "/Response/HotelResponse/OTA_HotelSearchRS/" "Properties/Property") city_code = task.kwargs.get('citycode') chinese_name = task.kwargs.get('chinesename') hotel_requests = list() hotel_addresses = dict() for property_elem in property_elems: hotel_code = str(property_elem.attrib['HotelCode']) \ if "HotelCode" in property_elem.attrib \ else None hotel_ctrip_city_code = str( property_elem.attrib['HotelCityCode']) \ if "HotelCityCode" in property_elem.attrib else None hotel_address = flist(property_elem.xpath( "*[local-name()='Address']/" "*[local-name()='AddressLine']/text()")) if isinstance(hotel_address, unicode): hotel_address = hotel_address.encode("utf-8") hotel_address = str(hotel_address) if hotel_code and hotel_ctrip_city_code: hotel_url = build_hotel_url(hotel_code) yield HotelCodeItem(hotel_code, city_code, hotel_url) hotel_requests.append(hotel_code) hotel_addresses[hotel_code] = hotel_address if len(hotel_requests) >= self.batch_count: yield build_rooms_task_for_hotel(hotel_requests, city_code, chinese_name, hotel_addresses) hotel_addresses.clear() del hotel_requests[:] # send left requests if 
len(hotel_requests) > 0: yield build_rooms_task_for_hotel(hotel_requests, city_code, chinese_name, hotel_addresses) hotel_addresses.clear() del hotel_requests[:]
def parse(self, task, input_file):
    """Parse an attraction detail page (UTF-8 page layout).

    Args:
        task: HttpTask, the crawl task (kwargs carry map_info/seq_sort/
            sid/relate_path)
        input_file: StringIO, the fetched page

    Yields:
        AttractionItem, then an HttpTask for the comment list.

    Raises:
        Exception: re-raised after logging when extraction fails.
    """
    self.logger.debug("attraction parser start to parse")
    parser = html.HTMLParser(encoding='utf-8')
    tree = html.parse(input_file, parser)
    try:
        name = flist(tree.xpath("//header[@class='title-head']/a/p/text()"),
                     u"")
        play_spend, play_spend_unit = _extract_play_spend(tree)
        address = flist(tree.xpath("//div[@id='J-aside-info-address']"
                                   "/span[@class='val address-value']"
                                   "/text()"), u"")
        tel_phone = flist(tree.xpath("//div[@id='J-aside-info-phone']"
                                     "/span[@class='val phone-value']"
                                     "/text()"), u"")
        time_elems = tree.xpath("//div[@id='J-aside-info-opening_hours']"
                                "/div[@class='val opening_hours-value']/p")
        time_list = []
        for time_elem in time_elems:
            # NOTE(review): .text may be None for empty <p>, which would
            # make the join below raise — confirm against real pages.
            time_list.append(time_elem.text)
        open_time = "".join(time_list)
        total_score = flist(tree.xpath("//div[@class='scene-rating']"
                                       "/div/@content"), u"")
        ticket_info = flist(tree.xpath("//div[@id='J-aside-info-price']"
                                       "/div[@class='val price-value']"
                                       "/p/text()"), u"")
        preview = _extract_preview(tree)
        traffic = _extract_traffic(tree)
        tips = _extract_tips(tree)
        hot = flist(tree.xpath("//section[@id='remark-container']"
                               "/div[@class='remark-overall-rating']"
                               "/span[@class='remark-all-counts']"
                               "/text()"), u"")
        # map_info is "lon,lat"; fall back to empty strings when absent.
        lon_lat = task.kwargs['map_info'].split(",")
        if len(lon_lat) <= 1:
            lon, lat = u"", u""
        else:
            lon, lat = lon_lat[0], lon_lat[1]
        seq_sort = task.kwargs['seq_sort']
        sid = task.kwargs['sid']
        attraction_item = AttractionItem(unicode(sid), unicode(name),
                                         unicode(play_spend),
                                         unicode(play_spend_unit),
                                         unicode(address),
                                         unicode(tel_phone),
                                         unicode(open_time),
                                         unicode(total_score),
                                         unicode(ticket_info),
                                         unicode(preview), unicode(hot),
                                         unicode(lon), unicode(lat),
                                         unicode(seq_sort),
                                         unicode(traffic), unicode(tips))
        yield attraction_item
        # Emit the comment-list task for this attraction.
        comments_request = build_comment_list_request(
            sid, task.kwargs['relate_path'])
        comments_task = HttpTask(comments_request,
                                 callback="CommentListParser",
                                 max_fail_count=3,
                                 cookie_host=LVYOU_HOST,
                                 kwargs={'sid': sid})
        yield comments_task
    except Exception, e:
        self.logger.error("extract Attraction failed error:%s" % e)
        self.logger.error("error traceback:%s" % traceback.format_exc())
        raise e
def parse(self, task, input_file): """parse response result Args: task: FileTask or HttpTask input_file: file or StringIO Yields: item: Item, result of parse task: Task, new task """ self.logger.info("hotel parser begin to parse") try: try: soap_tree = etree.fromstring(input_file.read()) except Exception, e: self.logger.error("not complete xml:%s" % e) raise ParserError("not complete xml") soap_elems = xpath_namespace( soap_tree, "/soap:Envelope/soap:Body/" "RequestResponse/RequestResult") xml_str = soap_elems[0].text tree = etree.fromstring(xml_str) elems = tree.xpath("/Response/Header") header = elems[0] if "ResultCode" not in header.attrib or \ header.attrib['ResultCode'] != "Success": self.logger.error("not has resultcode or " "resultcode is not success") raise ParserError("ResultCode error") else: # success property_elems = xpath_namespace( tree, "/Response/HotelResponse/OTA_HotelSearchRS/" "Properties/Property") city_code = task.kwargs.get('citycode') chinese_name = task.kwargs.get('chinesename') hotel_requests = list() hotel_addresses = dict() for property_elem in property_elems: hotel_code = str(property_elem.attrib['HotelCode']) \ if "HotelCode" in property_elem.attrib \ else None hotel_ctrip_city_code = str( property_elem.attrib['HotelCityCode']) \ if "HotelCityCode" in property_elem.attrib else None hotel_address = flist( property_elem.xpath( "*[local-name()='Address']/" "*[local-name()='AddressLine']/text()")) if isinstance(hotel_address, unicode): hotel_address = hotel_address.encode("utf-8") hotel_address = str(hotel_address) if hotel_code and hotel_ctrip_city_code: hotel_url = build_hotel_url(hotel_code) yield HotelCodeItem(hotel_code, city_code, hotel_url) hotel_requests.append(hotel_code) hotel_addresses[hotel_code] = hotel_address if len(hotel_requests) >= self.batch_count: yield build_rooms_task_for_hotel( hotel_requests, city_code, chinese_name, hotel_addresses) hotel_addresses.clear() del hotel_requests[:] # send left requests if 
len(hotel_requests) > 0: yield build_rooms_task_for_hotel(hotel_requests, city_code, chinese_name, hotel_addresses) hotel_addresses.clear() del hotel_requests[:]
def parse(self, task, input_file): """解析函数 Args: task: HTTPTask, 任务 input_file: StringIO, 网页信息 Yields: task: HTTPTask, 任务 item: Item, 解析的结果 """ self.logger.debug("attraction parser start to parse") parser = html.HTMLParser(encoding='utf-8') tree = html.parse(input_file, parser) try: name = flist( tree.xpath("//header[@class='title-head']/a/p/text()"), u"") play_spend, play_spend_unit = _extract_play_spend(tree) address = flist( tree.xpath("//div[@id='J-aside-info-address']" "/span[@class='val address-value']" "/text()"), u"") tel_phone = flist( tree.xpath("//div[@id='J-aside-info-phone']" "/span[@class='val phone-value']" "/text()"), u"") time_elems = tree.xpath("//div[@id='J-aside-info-opening_hours']" "/div[@class='val opening_hours-value']/p") time_list = [] for time_elem in time_elems: time_list.append(time_elem.text) open_time = "".join(time_list) total_score = flist( tree.xpath("//div[@class='scene-rating']" "/div/@content"), u"") ticket_info = flist( tree.xpath("//div[@id='J-aside-info-price']" "/div[@class='val price-value']" "/p/text()"), u"") preview = _extract_preview(tree) traffic = _extract_traffic(tree) tips = _extract_tips(tree) hot = flist( tree.xpath("//section[@id='remark-container']" "/div[@class='remark-overall-rating']" "/span[@class='remark-all-counts']" "/text()"), u"") lon_lat = task.kwargs['map_info'].split(",") if len(lon_lat) <= 1: lon, lat = u"", u"" else: lon, lat = lon_lat[0], lon_lat[1] seq_sort = task.kwargs['seq_sort'] sid = task.kwargs['sid'] attraction_item = AttractionItem(unicode(sid), unicode(name), unicode(play_spend), unicode(play_spend_unit), unicode(address), unicode(tel_phone), unicode(open_time), unicode(total_score), unicode(ticket_info), unicode(preview), unicode(hot), unicode(lon), unicode(lat), unicode(seq_sort), unicode(traffic), unicode(tips)) yield attraction_item # yield comment list task comments_request = build_comment_list_request( sid, task.kwargs['relate_path']) comments_task = HttpTask(comments_request, 
callback="CommentListParser", max_fail_count=3, cookie_host=LVYOU_HOST, kwargs={'sid': sid}) yield comments_task except Exception, e: self.logger.error("extract Attraction failed error:%s" % e) self.logger.error("error traceback:%s" % traceback.format_exc()) raise e
def parse(self, task, input_file): """parse method Args: task: Task, task input_file: file: file with content Yields: item: Item, result of parse task: Task, new task """ self.logger.debug("room parser begin to parse") try: try: soap_tree = etree.fromstring(input_file.read()) except Exception, e: self.logger.error("not complete xml:%s" % e) raise ParserError("not complete xml") hotel_address_dict = task.kwargs.get('address') soap_elems = xpath_namespace(soap_tree, "/soap:Envelope/soap:Body/" "RequestResponse/RequestResult") xml_str = soap_elems[0].text tree = etree.fromstring(xml_str) elems = tree.xpath("/Response/Header") header = elems[0] if "ResultCode" not in header.attrib or \ header.attrib['ResultCode'] != "Success": self.logger.error("not has resultcode or " "resultcode is not success") raise ParserError("ResultCode error") else: content_elems = xpath_namespace(tree, "/Response/HotelResponse/" "OTA_HotelDescriptiveInfoRS/" "HotelDescriptiveContents/" "HotelDescriptiveContent") for content_elem in content_elems: item_hotel_code = None item_hotel_city_code = task.kwargs.get('citycode') try: item_hotel_code = content_elem.attrib.get('HotelCode') item_hotel_name = content_elem.attrib.get('HotelName') item_hotel_brand_id = \ content_elem.attrib.get('BrandCode') position_elem = flist( xpath_namespace(content_elem, "HotelInfo//Position"), None) item_hotel_latitude = "" if position_elem is None \ or "Latitude" not in position_elem.attrib \ else position_elem.attrib.get('Latitude') item_hotel_longitude = "" if position_elem is None \ or "Longitude" not in position_elem.attrib \ else position_elem.attrib.get('Longitude') service_elems = xpath_namespace( content_elem, "HotelInfo/Services/Service") item_hotel_service = u"、".join( [flist(service.xpath( "*[local-name()='DescriptiveText']/text()")) for service in service_elems if "Code" in service.attrib and service.attrib["Code"] in HOTEL_SERVICE_CODES]) item_room_service = u"、".join( [flist(service.xpath( 
"*[local-name()='DescriptiveText']/text()")) for service in service_elems if "Code" in service.attrib and service.attrib["Code"] in ROOM_SERVICE_CODES]) awards_elem = flist( xpath_namespace(content_elem, "AffiliationInfo/Awards"), None) item_hotel_star, item_hotel_rate = ("", "") \ if awards_elem is None else \ (flist(awards_elem.xpath( "*[local-name()='Award' and " "@Provider='HotelStarRate']/@Rating")), flist(awards_elem.xpath( "*[local-name()='Award' and " "@Provider='CtripStarRate']/@Rating"))) multimedia_elem = flist( xpath_namespace(content_elem, "MultimediaDescriptions"), None) image_elems = [] if multimedia_elem is None \ else xpath_namespace( multimedia_elem, "MultimediaDescription/ImageItems/ImageItem") item_image_list = [] for image_elem in image_elems: image_url = flist(image_elem.xpath( "*[local-name()='ImageFormat']/" "*[local-name()='URL']/text()")) image_type = flist(image_elem.xpath("@Category")) if not image_url and not image_type: continue image_text = flist(image_elem.xpath( "*[local-name()='Description']/@Caption")) item_image_dict = {"image_url": image_url, "image_type": image_type, "image_text": image_text.encode('utf-8')} item_image_list.append(item_image_dict) if item_hotel_code and image_url: image_item = ImageItem(item_hotel_code, str(image_type), unicode(image_text), str(image_url)) yield image_item text_items_elem = flist(xpath_namespace( multimedia_elem, "MultimediaDescription/TextItems"), None) item_hotel_preview = "" if text_items_elem is None \ else flist(text_items_elem.xpath( "*[local-name()='TextItem' and @Category='5']/" "*[local-name()='Description']/text()")) room_elems = xpath_namespace(content_elem, "FacilityInfo/" "GuestRooms/GuestRoom") item_room_list = [] for room_elem in room_elems: room_info_id = flist(room_elem.xpath( "*[local-name()='TypeRoom']/@RoomTypeCode")) room_info_name = flist(room_elem.xpath( "@RoomTypeName")) room_bed_type = flist(room_elem.xpath( "*[local-name()='TypeRoom']/@BedTypeCode")) room_net_service, 
room_net_service_fee = \ _extract_net_service(room_elem) room_info_rate_price = "" room_hot = "" room_floor = flist(room_elem.xpath( "*[local-name()='TypeRoom']/@Floor")) room_breakfast = "" room_area = "" room_info_dict = {'roomInfo_id': room_info_id, 'roomInfo_ratePrice': room_info_rate_price, 'hot': room_hot} if room_info_id and room_info_name and \ item_hotel_code: item_room_list.append(room_info_dict) room_item = RoomInfoItem(item_hotel_code, str(room_info_id), room_info_name, room_floor, room_net_service, room_net_service_fee, room_bed_type, room_breakfast, room_area) yield room_item item_hotel_address = "" if item_hotel_code not in \ hotel_address_dict \ else hotel_address_dict.get(item_hotel_code) hotel_item = HotelInfoItem(item_hotel_code, item_hotel_city_code, item_hotel_name, item_hotel_brand_id, item_hotel_latitude, item_hotel_longitude, item_hotel_service, item_room_service, item_hotel_star, item_hotel_rate, item_image_list, item_hotel_preview, item_room_list, item_hotel_address) yield hotel_item except Exception, e: self.logger.warn("one hotel extract error:%s" % e) if item_hotel_code is None: self.logger.error("i am sorry, i can do noting") else: chinese_name = task.kwargs.get('chinesename') yield build_rooms_task_for_hotel( [item_hotel_code], item_hotel_city_code, chinese_name, hotel_address_dict)