Beispiel #1
0
    def parse(self, task, input_file):
        """Parse an attraction list page (JSON payload).

            Args:
                task: HttpTask, the request task
                input_file: StringIO, page content as a StringIO
            Yields:
                task: HttpTask, detail-page and next-page tasks
        """
        self.logger.debug("start parse attraction list page")
        try:
            json_data = json.load(input_file)
            city_name = json_data['data']['surl']
            current_page = int(json_data['data']['current_page'])
            scene_list = json_data['data']['scene_list']
            total_scene = int(json_data['data']['scene_total'])
            for index, scene in enumerate(scene_list):
                relate_path = scene['surl']
                sid = scene['sid']
                map_info = scene['ext']['map_info']
                # global 1-based rank of this scene across all pages
                seq_sort = \
                    (current_page - 1) * EVERY_PAGE_SCENE_COUNT + index + 1
                # build the attraction detail-page request
                http_request = HTTPRequest(build_scene_url(relate_path),
                                           connect_timeout=5,
                                           request_timeout=10)
                scene_task = HttpTask(http_request,
                                      callback="AttractionParser",
                                      max_fail_count=3,
                                      cookie_host=LVYOU_HOST,
                                      kwargs={
                                          "map_info": map_info,
                                          "seq_sort": seq_sort,
                                          "sid": sid,
                                          "relate_path": relate_path
                                      })
                yield scene_task

            # issue the next-page task while scenes remain
            if current_page * EVERY_PAGE_SCENE_COUNT < total_scene:
                next_request = build_next_page_request(city_name,
                                                       current_page + 1)
                next_page_task = HttpTask(next_request,
                                          callback="AttractionListParser",
                                          max_fail_count=5,
                                          cookie_host=LVYOU_HOST)
                yield next_page_task

        except Exception as e:
            # message corrected: this path fails on json *load*, not dump
            self.logger.info("json load error:%s for url:%s" %
                             (e, task.request.url))
            # bare ``raise`` preserves the original traceback
            raise
Beispiel #2
0
    def _check_and_execute_picture(self, picture_url, cookie_host,
                                   cookie_count):
        """Check whether the picture is already on disk; if not, rewrite its
        path and build a download task for it.

            Args:
                picture_url: str, url of the picture
                cookie_host: str, cookie host forwarded to the task
                cookie_count: int, cookie count forwarded to the task
            Returns:
                (picture_path, task): 2-tuple; ``task`` is None when the
                url is empty or the picture already exists locally
        """
        picture_path = u""
        if picture_url:
            # NOTE(review): the last two patterns look like intended
            # regexes (\s+| and \.jpg\\.*$), but str.replace matches them
            # literally, so they are almost certainly no-ops — confirm.
            picture_path = picture_url.replace(u"http://", self._picture_host)\
                .replace(u"\\s+|", "")\
                .replace(u"\\.jpg\\\\.*$", u".jpg")\
                .lower()

        # only download when a path was built and the file is not cached
        if len(picture_path) > 0 and not os.path.exists(self._picture_dir +
                                                        picture_path):
            picture_request = HTTPRequest(url=str(picture_url),
                                          connect_timeout=10,
                                          request_timeout=60)
            picture_task = HttpTask(
                picture_request,
                callback='PictureParser',
                cookie_host=cookie_host,
                cookie_count=cookie_count,
                max_fail_count=2,
                kwargs={'picturepath': self._picture_dir + picture_path})
            return picture_path, picture_task
        else:
            return picture_path, None
Beispiel #3
0
def create_city_type_task(city_name, city_code, abbreviation,
                          _type, tag, page=1, j=1):
    """Build a deal-list crawl task for one city/type/tag combination.

        Args:
            city_name: str, Chinese name of the city
            city_code: int, city code
            abbreviation: str, pinyin abbreviation of the city
            _type: str, type name
            tag: str, tag
            page: int, page number
            j: int, constant query parameter
        Returns:
            task: HttpTask, the new task
    """
    list_url = "http://www.228.com.cn/s/%s-%s/?j=%s&p=%s" % (
        abbreviation, _type, j, page)
    host_for_cookie = "http://www.228.com.cn/%s/" % abbreviation
    # forward everything the DealParser callback will need
    forwarded = {'type': _type,
                 'abbreviation': abbreviation,
                 'city_code': city_code,
                 'city_name': city_name,
                 'tag': tag,
                 'current_page': page,
                 'cookie_host': host_for_cookie,
                 'cookie_count': 20}
    request = HTTPRequest(url=list_url, connect_timeout=10, request_timeout=25)
    return HttpTask(request, callback="DealParser", max_fail_count=8,
                    cookie_host=host_for_cookie, cookie_count=20,
                    kwargs=forwarded)
Beispiel #4
0
    def _check_and_execute_picture(self, picture_url):
        """Check the picture info and build a picture download task.

            Args:
                picture_url: str, url of the picture
            Returns:
                (pictures, task): 2-tuple of (list of local paths, task);
                ``task`` is None when the url is empty or the picture is
                already on disk
        """
        pictures = []
        if picture_url:
            # NOTE(review): the last two patterns look like intended
            # regexes (\s+| and \.jpg\\.*$), but str.replace matches them
            # literally, so they are almost certainly no-ops — confirm.
            picture_path = picture_url.replace(u"http://", self._picture_host)\
                .replace(u"\\s+|", "")\
                .replace(u"\\.jpg\\\\.*$", u".jpg")\
                .lower()
            pictures.append(picture_path)

        # only download when a path was built and the file is not cached
        if len(pictures) >= 1 and not os.path.exists(self._picture_dir +
                                                     pictures[0]):
            picture_request = HTTPRequest(url=str(picture_url),
                                          connect_timeout=10,
                                          request_timeout=40)
            picture_task = HttpTask(
                picture_request,
                callback='PictureParser',
                cookie_host='http://www.nuomi.com',
                cookie_count=15,
                kwargs={'picturepath': self._picture_dir + pictures[0]})
            return pictures, picture_task
        else:
            return pictures, None
Beispiel #5
0
 def parse(self, task, input_file):
     """Parse the city list page and spawn one deal task per known city.

         Args:
             task: Task, the task description
             input_file: File, the file object
         Yields:
             Item
             Task
     """
     tree = html.parse(input_file)
     citys = tree.xpath("//p[@id='citypid']/text()")
     # xpath returns a list; fall back to "" when nothing matched
     citys = citys[0] if citys is not None and len(citys) > 0 else ""
     for city in citys.split(u","):
         city_english_name = remove_white(city)
         if len(city_english_name) > 0:
             city_item = CityItem("", city_english_name,
                                  get_city_code(city_english_name))
             # only emit cities that map to a known city code
             if city_item.english_name and city_item.city_code:
                 yield city_item
                 http_request = HTTPRequest(url=build_url_by_city_name(
                     city_item.english_name),
                                            connect_timeout=20,
                                            request_timeout=240)
                 new_task = HttpTask(
                     http_request,
                     callback='DealParser',
                     max_fail_count=5,
                     kwargs={'citycode': city_item.city_code})
                 yield new_task
Beispiel #6
0
    def parse(self, task, input_file):
        """Parse the deal list page (JSON data).

            Args:
                task: HttpTask, the task object
                input_file: File, the file object
            Yields:
                item: Item, extracted item
                task: new Task
        """
        # load the json payload
        self.logger.info("deal parser start to handle")
        json_data = json.load(input_file)
        elems = json_data.get('products')
        page_size = json_data.get('pageSize', 1)
        # unpack the parameters forwarded with the task
        city_name = task.kwargs.get('city_name')
        tag = task.kwargs.get('tag')
        current_page = task.kwargs.get('current_page')
        city_code = task.kwargs.get('city_code')
        _type = task.kwargs.get('type')
        abbreviation = task.kwargs.get('abbreviation')
        cookie_host = task.kwargs.get('cookie_host')
        cookie_count = task.kwargs.get('cookie_count')
        if elems is not None:
            for elem in elems:
                try:
                    url, name, start_time, end_time, place_name = \
                        _extract_elem(elem)

                    # emit the Activity item
                    yield ActivityItem(name, url, start_time, end_time,
                                       place_name, tag, city_code)
                    # issue the detail-page request; a fresh name avoids
                    # shadowing the incoming ``task`` parameter
                    request = HTTPRequest(url,
                                          connect_timeout=10,
                                          request_timeout=15)
                    activity_task = HttpTask(request,
                                             callback="ActivityParser",
                                             cookie_host=cookie_host,
                                             cookie_count=cookie_count,
                                             max_fail_count=3,
                                             kwargs={
                                                 "url": url,
                                                 "cookie_host": cookie_host,
                                                 "cookie_count": cookie_count
                                             })
                    yield activity_task
                except Exception as e:
                    self.logger.warn("extract one element failed error:%s" % e)

            # issue the next-page request while pages remain
            if current_page < int(page_size):
                next_page_task = create_city_type_task(city_name,
                                                       city_code,
                                                       abbreviation,
                                                       _type,
                                                       tag,
                                                       page=current_page + 1)
                yield next_page_task
Beispiel #7
0
def build_hotels_task_for_city(ctrip_code,
                               city_code,
                               chinese_name,
                               avaliable="false"):
    """build task for hotel search

        Args:
            ctrip_code: str, city code for ctrip
            city_code: str, city code of tigerknows
            chinese_name: str, chinese name of city
            avaliable: str, "true"/"false" value for the OTA request's
                AvailableOnlyIndicator attribute (name kept as-is for
                caller compatibility)
        Returns:
            task: HttpTask, new task
    """
    # the timestamp is part of the request signature below
    timestamp = int(time.time())
    # inner OTA_HotelSearchRQ payload, signed with the alliance credentials
    request_xml = """<?xml version="1.0" encoding="utf-8"?>
    <Request><Header  AllianceID="%s" SID="%s" TimeStamp="%s"
     RequestType="%s" Signature="%s" /><HotelRequest>
    <RequestBody xmlns:ns="http://www.opentravel.org/OTA/2003/05"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:xsd="http://www.w3.org/2001/XMLSchema">
    <ns:OTA_HotelSearchRQ Version="1.0" PrimaryLangID="zh"
    xsi:schemaLocation="http://www.opentravel.org/OTA/2003/05 OTA_HotelSearchRQ.xsd"
    xmlns="http://www.opentravel.org/OTA/2003/05">
    <ns:Criteria AvailableOnlyIndicator="%s"><ns:Criterion>
    <ns:HotelRef HotelCityCode="%s"/>
    <ns:Position PositionTypeCode="502" />
    </ns:Criterion></ns:Criteria></ns:OTA_HotelSearchRQ>
    </RequestBody></HotelRequest></Request>""" \
    % (ALLIANCE_ID, SID, timestamp, "OTA_HotelSearch",
       _create_signature(timestamp, ALLIANCE_ID, SID, "OTA_HotelSearch", API_KEY),
       avaliable, ctrip_code,)

    # SOAP envelope wrapping the (XML-escaped) inner request
    post_xml = """<?xml version="1.0" encoding="utf-8"?>
    <soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:xsd="http://www.w3.org/2001/XMLSchema"
    xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
    <soap:Body><Request xmlns="http://ctrip.com/">
    <requestXML>%s</requestXML></Request></soap:Body></soap:Envelope>""" \
               % escape(request_xml)

    http_request = HTTPRequest("http://%s/Hotel/OTA_HotelSearch.asmx" %
                               API_URL,
                               method="POST",
                               body=post_xml,
                               connect_timeout=20,
                               request_timeout=240,
                               headers={
                                   "SOAPAction": "http://ctrip.com/Request",
                                   "Content-Type": "text/xml; charset=utf-8"
                               })

    return HttpTask(http_request,
                    callback="HotelListParser",
                    max_fail_count=5,
                    kwargs={
                        "citycode": city_code,
                        "chinesename": chinese_name
                    })
Beispiel #8
0
class Intro1Spider(BaseSpider):
    """Spider seeded with a single 228.com.cn ticket activity page.

    Pages go to ActivityParser; WebItem results go to WebItemPipeline.
    """

    # callback name -> parser class
    parsers = {
        u"ActivityParser": ActivityParser,
    }

    # item name -> pipeline class
    pipelines = {
        u"WebItem": WebItemPipeline,
    }

    start_tasks = [HttpTask(HTTPRequest(u"http://www.228.com.cn/ticket-49052202.html"),
                            callback=u"ActivityParser")]
Beispiel #9
0
def build_tag_tasks():
    """Build one crawl task for every known tag.

        Returns:
            tasks: list, [HttpTask]
    """
    # ``items()`` instead of the Python-2-only ``iteritems()`` keeps this
    # working on both Python 2 and 3; iteration order does not matter here
    return [HttpTask(build_next_tag_page_request(key, 1, "shanghai"),
                     callback="TagListParser", max_fail_count=5,
                     cookie_host=LVYOU_HOST, kwargs={"tag": value})
            for key, value in TAGS.items()]
Beispiel #10
0
class LvYouDaoDaoSpider(BaseSpider):
    """Spider that crawls travel data from the DaoDao site.
    """
    # callback name -> parser class
    parsers = {
        "AttractionListParser": AttractionListParser,
        "AttractionParser": AttractionParser,
        "DescriptionParser": DescriptionParser,
    }

    # item name -> pipeline class
    pipelines = {
        "AttractionItem": AttractionItemPipeline,
        "DescriptionItem": DescriptionItemPipeline,
    }

    # seed task: first attraction list page for Shanghai
    start_tasks = [HttpTask(
        build_next_page_request(u"/Attractions-g308272-Activities-Shanghai.html"),
                            callback="AttractionListParser",)]
Beispiel #11
0
 def parse(self, task, input_file):
     """Parse the attraction list page.

         Args:
             task: HttpTask, the task
             input_file: StringIO, the page file
         Yields:
             task: HttpTask, new detail-page tasks
     """
     tree = html.parse(input_file)
     attraction_elems = tree.xpath(
         "//div[@id='ATTRACTION_OVERVIEW']"
         "/div[@class='attraction-list clearfix']")
     for attraction_elem in attraction_elems:
         try:
             info_elem = flist(
                 attraction_elem.xpath(
                     "div[@class='clearfix']/div[@class='info']"), None)
             rank_elem = flist(
                 attraction_elem.xpath(
                     "div[@class='clearfix']/div[@class='rank']"), None)
             relate_path = flist(
                 info_elem.xpath("div[@class='title']/a/@href"), u"")
             name = flist(info_elem.xpath("div[@class='title']/a/text()"),
                          u"")
             address = _extract_address(info_elem)
             hot = flist(rank_elem.xpath("a/strong/text()"), u"")
             rank = flist(rank_elem.xpath("span[1]/strong/text()"), u"")
             # build the attraction detail task
             http_request = build_attraction_request(relate_path)
             attraction_task = HttpTask(http_request,
                                        callback="AttractionParser",
                                        max_fail_count=3,
                                        cookie_host=LVYOU_HOST,
                                        kwargs={
                                            "name": unicode(name).strip(),
                                            "address": unicode(address),
                                            "hot": unicode(hot),
                                            "rank": unicode(rank)
                                        })
             yield attraction_task
         except Exception as e:
             self.logger.warn("extract one attraction failed error:%s" % e)
Beispiel #12
0
class LvYouBaiDuSpider(BaseSpider):
    """Spider that crawls Baidu travel data.
    """
    # callback name -> parser class
    parsers = {
        "AttractionListParser": AttractionListParser,
        "AttractionParser": AttractionParser,
        "CommentListParser": CommentListParser,
    }

    # item name -> pipeline class
    pipelines = {
        "AttractionItem": AttractionItemPipeline,
        "CommentListItem": CommentListItemPipeline,
    }

    # seed task: first attraction list page for Shanghai
    start_tasks = [
        HttpTask(build_next_page_request("shanghai", 1),
                 callback="AttractionListParser",
                 max_fail_count=5,
                 cookie_host=LVYOU_HOST)
    ]
Beispiel #13
0
    def parse(self, task, input_file):
        """Parse the tag list page (JSON).

            Args:
                task: HttpTask, the request task
                input_file: StringIO, page content as a StringIO
            Yields:
                task: HttpTask, next-page tasks
                item: Item, parse results
        """
        self.logger.debug("start parse tag list parser")
        try:
            json_data = json.load(input_file)
            city_name = json_data['data']['surl']
            current_page = int(json_data['data']['current_page'])
            scene_list = json_data['data']['scene_list']
            total_scene = int(json_data['data']['scene_total'])
            current_cid = json_data['data']['current_cid']
            tag = task.kwargs.get('tag', u"")
            # the loop index was unused, so iterate the scenes directly
            for scene in scene_list:
                yield TagItem(tag, current_cid, scene['sid'])

            # issue the next-page task while scenes remain
            if current_page * EVERY_PAGE_SCENE_COUNT < total_scene:
                next_request = build_next_tag_page_request(
                    current_cid, current_page + 1, city_name)
                next_tag_task = HttpTask(next_request,
                                         callback="TagListParser",
                                         max_fail_count=5,
                                         cookie_host=LVYOU_HOST,
                                         kwargs={'tag': tag})
                yield next_tag_task

        except Exception as e:
            self.logger.info("json loads error:%s for url:%s" %
                             (e, task.request.url))
            # bare ``raise`` preserves the original traceback
            raise
Beispiel #14
0
class NuomiSpider(BaseSpider):
    """Spider that crawls group-buy deal data from Nuomi.
    """
    # callback name -> parser class
    parsers = {
        'CityParser': CityParser,
        'DealParser': DealParser,
        'PictureParser': PictureParser,
    }

    # item name -> pipeline class (CityItem is intentionally dropped)
    pipelines = {
        'CityItem': EmptyPipeline,
        'DealItem': DealItemPipeline,
        'PictureItem': PictureItemPipeline,
    }

    # seed task: the Nuomi API help page, parsed for the city list
    start_tasks = [
        HttpTask(HTTPRequest(url='http://www.nuomi.com/help/api',
                             connect_timeout=10,
                             request_timeout=20),
                 callback='CityParser',
                 max_fail_count=8,
                 kwargs={}),
    ]
Beispiel #15
0
def build_rooms_task_for_hotel(hotel_requests, city_code, chinese_name,
                               hotel_addresses):
    """build room task for hotel

        Args:
            hotel_requests: list, [(hotel_code, city_code, chinese_name)]
            city_code: str, city code of tigerknows
            chinese_name: str, chinese name of city
            hotel_addresses: dict, hotel address dict
        Returns:
            task: HttpTask, new task for hotel search
    """
    # the timestamp is part of the request signature below
    timestamp = int(time.time())

    # NOTE(review): each element of hotel_requests is substituted into a
    # single %s placeholder, so elements appear to be plain hotel codes —
    # the tuple description in the docstring looks stale; confirm.
    request_info_xml = "".join([
        """<HotelDescriptiveInfo HotelCode="%s"
    PositionTypeCode="502">
    <HotelInfo SendData="true"/><FacilityInfo SendGuestRooms="true"/>
    <AreaInfo SendAttractions="false" SendRecreations="false"/>
    <ContactInfo SendData="false"/><MultimediaObjects SendData="true"/>
    </HotelDescriptiveInfo>""" % hotel_code for hotel_code in hotel_requests
    ])

    # inner OTA_HotelDescriptiveInfoRQ payload, signed with the alliance
    # credentials
    request_xml = """<?xml version="1.0" encoding="utf-8"?><Request>
    <Header  AllianceID="%s" SID="%s" TimeStamp="%s"  RequestType="%s"
    Signature="%s" />
    <HotelRequest><RequestBody xmlns:ns="http://www.opentravel.org/OTA/2003/05"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:xsd="http://www.w3.org/2001/XMLSchema">
    <OTA_HotelDescriptiveInfoRQ Version="1.0"
    xsi:schemaLocation="http://www.opentravel.org/OTA/2003/05
    OTA_HotelDescriptiveInfoRQ.xsd" xmlns="http://www.opentravel.org/OTA/2003/05"
    xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance">
    <HotelDescriptiveInfos>%s</HotelDescriptiveInfos></OTA_HotelDescriptiveInfoRQ>
    </RequestBody></HotelRequest></Request>""" % (
        ALLIANCE_ID, SID, timestamp, "OTA_HotelDescriptiveInfo",
        _create_signature(timestamp, ALLIANCE_ID, SID,
                          "OTA_HotelDescriptiveInfo",
                          API_KEY), request_info_xml)

    # SOAP envelope wrapping the (XML-escaped) inner request
    post_xml = """<?xml version="1.0" encoding="utf-8"?>
    <soap:Envelope xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance"
    xmlns:xsd="http://www.w3.org/2001/XMLSchema"
    xmlns:soap="http://schemas.xmlsoap.org/soap/envelope/">
    <soap:Body><Request xmlns="http://ctrip.com/">
    <requestXML>%s</requestXML></Request></soap:Body></soap:Envelope>""" \
               % escape(request_xml)

    http_request = HTTPRequest(
        "http://%s/Hotel/OTA_HotelDescriptiveInfo.asmx" % API_URL,
        method="POST",
        body=post_xml,
        connect_timeout=20,
        request_timeout=360,
        headers={
            "SOAPAction": "http://ctrip.com/Request",
            "Content-Type": "text/xml; charset=utf-8"
        })
    return HttpTask(http_request,
                    callback="HotelParser",
                    max_fail_count=5,
                    kwargs={
                        "citycode": city_code,
                        "chinesename": chinese_name,
                        "address": hotel_addresses
                    })
Beispiel #16
0
    def parse(self, task, input_file):
        """Parse an attraction detail page.

            Args:
                task: HttpTask, the task
                input_file: StringIO, the page file
            Yields:
                item
        """
        self.logger.debug("attraction parser start to parse")
        content = input_file.read()
        tree = html.parse(StringIO(content))
        try:
            zip_code = flist(tree.xpath("//span[@class='postal-code']/text()"),
                             u"")
            play_spend, play_spend_unit = _extract_play_spend_and_unit(content)
            tel_phone = flist(
                tree.xpath("//div[@id='HEADING_GROUP']"
                           "/div[@class='wrap infoBox']"
                           "/div[@class='odcHotel blDetails']"
                           "/div/div[@class='fl']/text()"), u"")
            open_time = u""
            # NOTE(review): unlike the sibling calls, no default is passed
            # here — confirm flist's own default is acceptable
            total_score = flist(
                tree.xpath("//div[@class='rs rating']"
                           "/span/img/@content"))
            ticket_info = u""
            preview_relate_path = flist(
                tree.xpath("//div[@class='listing_description']/a/@href"), u"")
            lon, lat = _extract_lon_lat(
                flist(
                    tree.xpath("//div[@class='js_mapThumb']"
                               "/div[@id='bmapContainer']/img[1]/@src"), u""))
            comments = _extract_comments(tree)
            # build the attraction item (description excluded)
            attraction_item = AttractionItem(task.request.url,
                                             task.kwargs['name'],
                                             unicode(play_spend),
                                             play_spend_unit,
                                             task.kwargs['address'],
                                             unicode(tel_phone),
                                             unicode(open_time),
                                             unicode(total_score),
                                             unicode(ticket_info),
                                             task.kwargs['hot'], lon, lat,
                                             task.kwargs['rank'], comments,
                                             unicode(zip_code))
            yield attraction_item

            # build the description task, or emit an empty description
            if len(preview_relate_path) != 0:
                description_request = build_description_request(
                    task.request.url, preview_relate_path)
                description_task = HttpTask(description_request,
                                            callback="DescriptionParser",
                                            max_fail_count=3,
                                            cookie_host=LVYOU_HOST,
                                            kwargs={'url': task.request.url})
                yield description_task
            else:
                yield DescriptionItem(task.request.url, u"")

        except Exception as e:
            # report through the parser's logger (like every sibling
            # parser) instead of bare Python-2-only ``print`` statements;
            # the error is still swallowed, matching original behavior
            self.logger.error("error:%s" % e)
            self.logger.error(traceback.format_exc())
Beispiel #17
0
class AttractionListParser(BaseParser):
    """Parser for attraction list pages.
    """
    def __init__(self, namespace):
        BaseParser.__init__(self, namespace)
        self.logger.info("init attraction list parser finish")

    def parse(self, task, input_file):
        """Parse the attraction list page.

            Args:
                task: HttpTask, the task
                input_file: StringIO, the page file
            Yields:
                task: HttpTask, new tasks
        """
        tree = html.parse(input_file)
        attraction_elems = tree.xpath(
            "//div[@id='ATTRACTION_OVERVIEW']"
            "/div[@class='attraction-list clearfix']")
        for attraction_elem in attraction_elems:
            try:
                info_elem = flist(
                    attraction_elem.xpath(
                        "div[@class='clearfix']/div[@class='info']"), None)
                rank_elem = flist(
                    attraction_elem.xpath(
                        "div[@class='clearfix']/div[@class='rank']"), None)
                relate_path = flist(
                    info_elem.xpath("div[@class='title']/a/@href"), u"")
                name = flist(info_elem.xpath("div[@class='title']/a/text()"),
                             u"")
                address = _extract_address(info_elem)
                hot = flist(rank_elem.xpath("a/strong/text()"), u"")
                rank = flist(rank_elem.xpath("span[1]/strong/text()"), u"")
                # build the attraction detail task
                http_request = build_attraction_request(relate_path)
                attraction_task = HttpTask(http_request,
                                           callback="AttractionParser",
                                           max_fail_count=3,
                                           cookie_host=LVYOU_HOST,
                                           kwargs={
                                               "name": unicode(name).strip(),
                                               "address": unicode(address),
                                               "hot": unicode(hot),
                                               "rank": unicode(rank)
                                           })
                yield attraction_task
            except Exception as e:
                self.logger.warn("extract one attraction failed error:%s" % e)
        # build the next-page task if a "next" link exists
        next_page_relate = flist(
            tree.xpath(
                "//div[@class='pagination']/div"
                "/a[@class='next sprite-arrow-right-green ml6 ']/@href"), u"")
        if len(next_page_relate) != 0:
            next_page_request = build_next_page_request(next_page_relate)
            next_page_task = HttpTask(next_page_request,
                                      callback="AttractionListParser",
                                      max_fail_count=5,
                                      cookie_host=LVYOU_HOST)
            yield next_page_task
Beispiel #18
0
    def parse(self, task, input_file):
        """Parse an attraction detail page.

            Args:
                task: HTTPTask, the task
                input_file: StringIO, the page content
            Yields:
                task: HTTPTask, comment-list tasks
                item: Item, parse results
        """
        self.logger.debug("attraction parser start to parse")
        parser = html.HTMLParser(encoding='utf-8')
        tree = html.parse(input_file, parser)
        try:
            name = flist(
                tree.xpath("//header[@class='title-head']/a/p/text()"), u"")
            play_spend, play_spend_unit = _extract_play_spend(tree)
            address = flist(
                tree.xpath("//div[@id='J-aside-info-address']"
                           "/span[@class='val address-value']"
                           "/text()"), u"")
            tel_phone = flist(
                tree.xpath("//div[@id='J-aside-info-phone']"
                           "/span[@class='val phone-value']"
                           "/text()"), u"")
            time_elems = tree.xpath("//div[@id='J-aside-info-opening_hours']"
                                    "/div[@class='val opening_hours-value']/p")
            # join the opening-hours paragraphs into one string
            open_time = "".join(time_elem.text for time_elem in time_elems)
            total_score = flist(
                tree.xpath("//div[@class='scene-rating']"
                           "/div/@content"), u"")
            ticket_info = flist(
                tree.xpath("//div[@id='J-aside-info-price']"
                           "/div[@class='val price-value']"
                           "/p/text()"), u"")
            preview = _extract_preview(tree)
            traffic = _extract_traffic(tree)
            tips = _extract_tips(tree)
            hot = flist(
                tree.xpath("//section[@id='remark-container']"
                           "/div[@class='remark-overall-rating']"
                           "/span[@class='remark-all-counts']"
                           "/text()"), u"")
            # map_info is a "lon,lat" string; fall back to empty values
            lon_lat = task.kwargs['map_info'].split(",")
            if len(lon_lat) <= 1:
                lon, lat = u"", u""
            else:
                lon, lat = lon_lat[0], lon_lat[1]
            seq_sort = task.kwargs['seq_sort']
            sid = task.kwargs['sid']
            attraction_item = AttractionItem(unicode(sid), unicode(name),
                                             unicode(play_spend),
                                             unicode(play_spend_unit),
                                             unicode(address),
                                             unicode(tel_phone),
                                             unicode(open_time),
                                             unicode(total_score),
                                             unicode(ticket_info),
                                             unicode(preview), unicode(hot),
                                             unicode(lon), unicode(lat),
                                             unicode(seq_sort),
                                             unicode(traffic), unicode(tips))
            yield attraction_item

            # yield comment list task
            comments_request = build_comment_list_request(
                sid, task.kwargs['relate_path'])
            comments_task = HttpTask(comments_request,
                                     callback="CommentListParser",
                                     max_fail_count=3,
                                     cookie_host=LVYOU_HOST,
                                     kwargs={'sid': sid})
            yield comments_task
        except Exception as e:
            self.logger.error("extract Attraction failed error:%s" % e)
            self.logger.error("error traceback:%s" % traceback.format_exc())
            # bare ``raise`` preserves the original traceback
            raise