Ejemplo n.º 1
0
    def __setattr__(self, key, value):
        """Validate ``value`` against the column registered for ``key`` and store it.

        Assignments to the column registry itself
        (``_BaseModel__columns_dict``) are silently ignored.

        Raises:
            KeyError: no (truthy) column is registered under ``key``.
            TypeError: the column's ``judgement_type`` rejects ``value``.
        """
        if key == '_BaseModel__columns_dict':
            return

        registry = self._BaseModel__columns_dict
        column = registry.get(key, None)
        if not column:
            raise KeyError(str(key))

        type_accepted = column.judgement_type(value)
        if not type_accepted:
            raise TypeError('%s must be %s' % (value, column._typ))

        storage = self.__dict__
        if not key_is_legal(value):
            # NOTE(review): this default is overwritten by the unconditional
            # store below, so an "illegal" value is still what ends up saved.
            # Presumably the default was meant to win -- confirm with callers
            # before changing, since some may rely on the raw value.
            storage[key] = column._default

        storage[key] = value
 def _execute(self, **kwargs):
     """Geocode the task's ``description`` and store the enriched task record.

     The coordinates returned by ``google_get_map_info_zxp`` are attached to
     a deep copy of the task kwargs under ``map_info``. The copy is inserted
     into ``client['CitySuggest'][<task name>]`` only when ``map_info``
     passes ``key_is_legal`` and parses as a ``lon,lat`` pair that is not
     ``0,0``. ``task.error_code`` is set to 0 on insert and to 12 otherwise.
     """
     task = self.task
     log = self.logger

     address = task.kwargs['description']
     log.debug('-' * 20)
     map_info = google_get_map_info_zxp(address)
     log.debug('{0} --- {1}'.format(address, map_info))

     collection = client['CitySuggest'][task.task_name]
     record = deepcopy(task.kwargs)
     record['map_info'] = map_info
     log.debug(map_info)

     # A pair that does not parse, or that is exactly (0, 0), is unusable.
     try:
         lon_text, lat_text = map_info.split(',')
         coords_usable = not (float(lon_text) == 0.0
                              and float(lat_text) == 0.0)
     except Exception as err:
         coords_usable = False
         log.exception(msg="[map info is not legal]", exc_info=err)
     log.debug('*' * 20)

     if not (key_is_legal(map_info) and coords_usable):
         task.error_code = 12
         return

     collection.insert(record)
     task.error_code = 0
Ejemplo n.º 3
0
def _bbox_center(root, bbox_xpath):
    """Return the ``"a,b"`` centre of the first ``data-bbox`` matched by *bbox_xpath*.

    The attribute is read as four comma-separated numbers; components 0/2 and
    1/3 are averaged pairwise. Raises (IndexError / ValueError) when the
    attribute is missing or malformed so the caller can try its next source.
    """
    bbox = root.xpath(bbox_xpath)[0].strip().split(',')
    first = (float(bbox[0]) + float(bbox[2])) / 2.0
    second = (float(bbox[1]) + float(bbox[3])) / 2.0
    return str(first) + ',' + str(second)


def _apply_facility_flags(hotel, facility_text):
    """Set the parking / WiFi flags on *hotel* from one facility section's text."""
    # Both "付费" and "收费" mean "charged". A single `('付费' or '收费') in text`
    # test would only ever look for the first word, so test each explicitly.
    is_paid = '付费' in facility_text or '收费' in facility_text
    is_free = '免费' in facility_text  # "free of charge"

    if '停车场' in facility_text:  # "car park"
        hotel.has_parking = 'Yes'
        if is_paid:
            hotel.is_parking_free = 'No'
        elif is_free:
            hotel.is_parking_free = 'Yes'
    if 'WiFi' in facility_text:
        hotel.has_wifi = 'Yes'
        if is_paid:
            hotel.is_wifi_free = 'No'
        elif is_free:
            hotel.is_wifi_free = 'Yes'


def _set_cards_from_description(hotel, root):
    """Fill ``hotel.accepted_cards`` from the ``description`` list, or ``'NULL'``."""
    try:
        # The last <li> of the list is dropped before joining.
        cards = root.xpath('//div[@class="description"]/ul/li/text()')[:-1]
        hotel.accepted_cards = '|'.join(cards)
    except Exception:
        hotel.accepted_cards = 'NULL'


def booking_parser(content, url, other_info):
    """Parse a Booking.com hotel detail page into a ``BookingHotel``.

    Every field is extracted best-effort: each section tries one or more page
    layouts and falls back to ``'NULL'`` / ``-1`` (or leaves the field
    untouched) when nothing matches.

    Args:
        content: Raw page source. It is decoded as UTF-8 via
            ``str(content).decode`` (Python 2 style text handling).
        url: Page URL; stored UTF-8 encoded in ``hotel.hotel_url``.
        other_info: Mapping providing ``city_id`` and either a truthy ``hid``
            (the hotel id is then read from the page) or ``source_id``.

    Returns:
        The populated ``BookingHotel`` instance.

    NOTE(review): when decoding or HTML parsing fails, ``root`` is never
    bound; most sections swallow the resulting NameError, but the final
    coordinate and address fallbacks do not, so such pages still end in an
    exception.
    """
    hotel = BookingHotel()
    try:
        content = str(content).decode('utf-8')
        root = HTML.fromstring(content)
    except Exception:
        pass

    # City id used by the source site.
    try:
        source_city_id = re.findall(r'params.context_dest_id = \'([-+]?\d+)\'',
                                    content)[0]
        hotel.source_city_id = source_city_id.encode('utf8')
    except Exception:
        pass

    # Hotel names: "<english> (<local>)" when a parenthesised part exists,
    # otherwise a single name classified by whether it contains word chars.
    try:
        name_temp = root.xpath('//*[@class="hp__hotel-name"]')[0].text_content(
        ).strip('\t').strip('\n')
        temp_name = name_temp.split('(')
        if len(temp_name) == 1:
            # NOTE(review): this retry is textually identical to the split
            # above; presumably one of the two originally targeted a
            # full-width parenthesis -- confirm against the upstream file.
            temp_name = name_temp.split('(')
        if len(temp_name) == 2:
            hotel.hotel_name_en, hotel.hotel_name = temp_name[0].encode(
                'utf8'), temp_name[1].strip(')').strip(')').encode('utf8')
        elif len(temp_name) == 1:
            temp = re.findall(r'\w+', temp_name[0])
            if len(temp) == 0:
                hotel.hotel_name = temp_name[0].encode('utf8')
                hotel.hotel_name_en = 'NULL'
            else:
                hotel.hotel_name = 'NULL'
                hotel.hotel_name_en = temp_name[0].encode('utf8')
    except Exception:
        pass

    # Brand name.
    try:
        hotel.brand_name = brand_pat.findall(content)[0].strip().replace(
            '"', '""')
    except Exception:
        hotel.brand_name = 'NULL'

    # Coordinates: static map image, then two data-bbox spans, then the
    # "show map" link, and finally the inline JS map centre.
    try:
        pp = root.xpath('//*[@class="map_static_zoom_images"]/img/@src'
                        )[0].strip().encode('utf-8')
        try:
            map_infos = str(map_pat.findall(pp)[0])
            # The matched pair is stored with its two components swapped.
            hotel.map_info = map_infos.split(',')[1] + ',' + map_infos.split(
                ',')[0]
        except Exception:
            pass
    except Exception:
        try:
            hotel.map_info = _bbox_center(
                root,
                '//span[contains(@class , "hp_address_subtitle")]/@data-bbox')
        except Exception:
            try:
                hotel.map_info = _bbox_center(
                    root,
                    '//span[contains(@class , "hp_location_address_line")]/@data-bbox'
                )
            except Exception:
                map_infos = root.xpath('//a[@id="show_map"]/@data-coords')
                if map_infos:
                    map_infos = str(map_infos[0]).split(',')
                    hotel.map_info = map_infos[0] + ',' + map_infos[1]
                else:
                    # "-?" (optional sign) instead of "-+": the latter needs
                    # at least one minus, so positive coordinates never
                    # matched and were reported as 0.
                    lat_tmp = re.findall(
                        r'booking.env.b_map_center_latitude = (-?\d+\.\d+);',
                        content)
                    latitude = lat_tmp[0] if len(lat_tmp) > 0 else 0
                    lon_tmp = re.findall(
                        r'booking.env.b_map_center_longitude = (-?\d+\.\d+);',
                        content)
                    longitude = lon_tmp[0] if len(lon_tmp) > 0 else 0
                    hotel.map_info = '{0},{1}'.format(longitude, latitude)

    # Address.
    try:
        strs = root.xpath(
            '//span[contains(@class, "hp_address_subtitle")]/text()')
        hotel.address = strs[0].encode('utf-8').strip().replace('"', '""')
    except Exception:
        strs = root.xpath(
            '//span[contains(@class, "hp_location_address_line")]/text()')
        if len(strs):
            hotel.address = strs[0].encode('utf-8').strip().replace('"', '""')
        else:
            try:
                adress_temp = root.xpath(
                    '//p[@class="b_hotelAddress"]//text()')
                adress_temp = ' '.join(
                    part.replace('\n', '') for part in adress_temp)
                # Strip the "show map" link caption.
                hotel.address = adress_temp.replace('显示地图', '')
            except Exception:
                pass

    # Star rating (-1 when unknown).
    hotel.star = -1
    try:
        star_temp = root.find_class('hp__hotel_ratings__stars')[0].xpath(
            'i/@class')[0].strip()
        hotel.star = num_pat.findall(star_temp)[0]
        hotel.star = int(hotel.star)
    except Exception:
        try:
            star_title = root.xpath(
                '//*[@class="nowrap hp__hotel_ratings"]//span[@class="invisible_spoken"]/text()'
            )
            if star_title:
                star = re.findall(r'(\d+)', star_title[0])
                if star:
                    hotel.star = int(star[0])

            # When there is no spoken star title, read the rating from the
            # class of the svg icon instead.
            if hotel.star == -1:
                star_svg = root.xpath(
                    '//*[@class="nowrap hp__hotel_ratings"]//svg/@class')
                if star_svg:
                    star = re.findall(r'-sprite-ratings_circles_(\d+)',
                                      star_svg[0])
                    if star:
                        hotel.star = int(star[0])
            if hotel.star == -1:
                star_svg = root.xpath(
                    '//span[@class="hp__hotel_ratings__stars"]//svg[@class]')
                if star_svg:
                    hotel.star = int(
                        re.search(r'\d+',
                                  star_svg[0].attrib.get('class')).group(0))
        except Exception:
            pass

    # Review score.
    try:
        grade_temp = root.xpath(
            '//div[contains(@class, "hotel_large_photp_score")]/@data-review-score'
        )
        hotel.grade = str(grade_temp[0])
    except Exception:
        try:
            grade_temp = root.xpath('//div[@id="review_block_top"]/text()'
                                    )[1].strip().encode('utf-8')
            hotel.grade = grade_temp
        except Exception:
            hotel.grade = 'NULL'

    # Number of reviews (-1 when unknown).
    try:
        score = re.findall(
            r'(\d+)',
            root.xpath(
                '//a[@class="hp_nav_reviews_link toggle_review track_review_link_zh"]/span/text()'
            )[1])[0]
        hotel.review_num = score.encode('utf8')
    except Exception:
        try:
            re_start = root.xpath(
                '//div[@class="location_score_tooltip"]/p[1]/small/strong/text()'
            )[0]
            hotel.review_num = int(re_start)
        except Exception:
            try:
                re_num = root.xpath('//div[@id="review_block_top"]/text()'
                                    )[0].strip().encode('utf-8')
                re_num = re.findall(r'\d+', re_num)[0]
                hotel.review_num = re_num
            except Exception:
                hotel.review_num = -1

    # Description; the site's "not available in your language" notice is
    # removed from the summary text.
    try:
        hotel.description = root.get_element_by_id('summary') \
            .text_content().encode('utf-8').strip().replace('"', '""').replace('\n', '').replace(
            '抱歉,该住宿简介暂无您所选择的语言版本,目前正在更新中。', '')
        infos = root.xpath(
            '//div[@class="hotel_description_wrapper_exp hp-description"]/p[@class="geo_information"]/text()'
        )
        if len(infos):
            hotel.description += infos[0].strip().replace('\r', '').replace(
                '\n', '')
    except Exception:
        try:
            desc = root.xpath('//div[@class="b_hotelDescription"]/p/text()')
            desc = ''.join(desc)
            hotel.description = desc
        except Exception:
            hotel.description = 'NULL'

    # Accepted payment cards, '|'-separated.
    try:
        card_elements = root.find_class('creditcard')
        card_names = set(
            element.attrib['class'].replace('creditcard', '').strip()
            for element in card_elements)
        # Sorted so the stored value does not depend on set iteration order.
        hotel.accepted_cards = '|'.join(sorted(card_names)).replace('"', '""')
        if not len(card_elements):
            _set_cards_from_description(hotel, root)
    except Exception:
        _set_cards_from_description(hotel, root)

    # Check-in time.
    try:
        hotel.check_in_time = root.get_element_by_id('checkin_policy').text_content() \
            .encode('utf-8').strip().replace('\n', ' ').replace('"', '""')
    except Exception:
        try:
            check_in_time = root.xpath(
                '//div[@class="description"]/p[1]/text()')
            hotel.check_in_time = check_in_time[0].strip().encode('utf-8')
        except Exception:
            hotel.check_in_time = 'NULL'

    # Check-out time.
    try:
        hotel.check_out_time = root.get_element_by_id('checkout_policy').text_content() \
            .encode('utf-8').strip().replace('\n', ' ').replace('"', '""')
    except Exception:
        try:
            check_out_time = root.xpath(
                '//div[@class="description"]/p[2]/text()')
            hotel.check_out_time = check_out_time[0].strip().encode('utf-8')
        except Exception:
            hotel.check_out_time = 'NULL'

    # Services: "<section>::<item>|<item>|..." for each facility section.
    try:
        # Presence check only: the lookup is expected to raise when the
        # facilities box is absent, which selects the older layout below.
        root.get_element_by_id('hp_facilities_box')
        service_ele_list = root.xpath(
            '//div[@class="facilitiesChecklistSection"]')
        service_ele_list.extend(
            root.xpath('//div[@class="facilitiesChecklistSection\n"]'))
        hotel.service = ''
        for each_service_ele in service_ele_list:
            try:
                # Use the first heading text node that is not a bare newline.
                service_item_name = each_service_ele.xpath('h5/text()')
                for s in service_item_name:
                    if s != '\n':
                        service_item_name = s.strip().decode('utf-8')
                        break
                service_items = each_service_ele.xpath(
                    './ul/li//text()') or each_service_ele.xpath(
                        './div/ul/li/text()')
                service_items = [
                    item for item in (
                        raw.replace('\n', '').strip().encode('utf-8')
                        for raw in service_items) if item
                ]
                service_temp = '|'.join(service_items)
                service_temp = service_item_name + '::' + service_temp + '|'
                _apply_facility_flags(hotel, service_temp)
                hotel.service += service_temp.encode('utf-8')
            except Exception:
                # Known rough spot (text/bytes mixing): a section that fails
                # is skipped rather than aborting the whole list.
                continue
        # Drop the trailing '|'.
        hotel.service = hotel.service[:-1]
    except Exception:
        try:
            elements = root.xpath('//div[@class="hotel_facilities_block"]')
            service = ''
            for s in elements:
                con = s.xpath('./ul[@class="b_newHotelFacilities"]/li/text()'
                              ) or s.xpath(
                                  './p[@class="b_hotelFacilities"]/text()')
                if not len(con):
                    continue
                title = s.xpath('./h3/text()')[0].strip().encode('utf-8')
                temp = title + '::' + '|'.join(
                    item.strip().encode('utf-8') for item in con) + '|'
                _apply_facility_flags(hotel, temp)
                service += temp.encode('utf-8')
            hotel.service = service[:-1]
        except Exception:
            hotel.service = 'NULL'

    # Photos: first from the inline `hotelPhotos` JS array ...
    try:
        hotelPhoto_str = re.findall(r'hotelPhotos:([\s\S]+?)]',
                                    content)[0] + ']'
        hotel.img_items = '|'.join(
            link.replace('\'', '').strip() + '.jpg'
            for link in re.findall(r'large_url:([\s\S]+?).jpg',
                                   hotelPhoto_str))
    except Exception:
        pass

    # ... then from the gallery markup when that produced nothing usable.
    first_img = None
    if not key_is_legal(hotel.img_items):
        try:
            hotels_class = root.find_class(
                'hp-gallery-slides hp-gallery-top')[0]
            img_src = hotels_class.xpath('.//img/@src')[0]
            img_lazy = hotels_class.xpath('.//img/@data-lazy')
            img_items = img_src + '|' + '|'.join(img_lazy)
            first_img = img_src
            hotel.img_items = img_items
        except Exception:
            try:
                img_items = root.xpath('//div[@id="b_imgList"]/ul/li/a/@href')
                img_items = '|'.join(img_items)
                hotel.img_items = img_items
            except Exception:
                hotel.img_items = 'NULL'

            try:
                first_img = root.xpath(
                    '//a[contains(@class, "active-image")]/img/@src')[0]
                if not hotel.img_items:
                    hotel.img_items += first_img
            except Exception:
                pass

    hotel.source = 'booking'
    hotel.hotel_url = url.encode('utf-8')
    if other_info.get('hid'):
        # With a hid, the source id is taken from the page itself.
        hotel.source_id = re.search(r"b_hotel_id: ?'(-?\d+)'",
                                    content).groups()[0]
    else:
        hotel.source_id = other_info['source_id']
    hotel.city_id = other_info['city_id']

    if first_img:
        hotel.others_info = json.dumps({
            'first_img': first_img,
            'hid': other_info.get('hid', 'NULL')
        })

    return hotel
Ejemplo n.º 4
0
    def _execute(self, **kwargs):
        """Fetch one place.qyer.com page, validate the parse and store it.

        Reads ``city_id`` and ``target_url`` from ``self.task.kwargs``,
        downloads the page, parses it with ``page_parser`` and, when the
        parsed coordinates are unusable, replaces them with the result of
        ``google_get_map_info(address)``. The record's ``__dict__`` is then
        written to the table named after the task.

        Returns:
            ``0`` (also stored in ``self.task.error_code``) on success.

        Raises:
            Exception: the page asks for a captcha.
            TypeCheckError: name, English name, coordinates and introduction
                are all unusable.
            ServiceStandardError: the database write failed.
        """
        with MySession(need_cache=True, need_proxies=True) as session:
            city_id = self.task.kwargs['city_id']
            target_url = self.task.kwargs['target_url']
            headers = {'Host': 'place.qyer.com'}
            page = session.get(target_url, headers=headers, timeout=240)
            page.encoding = 'utf8'
            content = page.text

            # The marker text means "please enter the captcha".
            if '请输入验证码' in content:
                raise Exception("请输入验证码")

            result = page_parser(content=content, target_url=target_url)
            result.city_id = city_id
            name = result.name
            name_en = result.name_en
            map_info = result.map_info
            address = result.address

            # Coordinates must parse as "lon,lat" and must not be (0, 0).
            map_info_is_legal = True
            try:
                lon, lat = map_info.split(',')
                if float(lon) == 0.0 and float(lat) == 0.0:
                    map_info_is_legal = False
            except Exception as e:
                map_info_is_legal = False
                logger.exception(msg="[map info is not legal]", exc_info=e)

            if not key_is_legal(map_info) or not map_info_is_legal:
                # TODO: the checks that rejected an unusable address or an
                # unusable geocoding result are temporarily disabled, so the
                # geocoder output is stored as-is.
                result.map_info = google_get_map_info(address)

            if key_is_legal(name) or key_is_legal(
                    name_en) or map_info_is_legal or key_is_legal(
                        result.introduction):
                # Lazy %-formatting: either name may be None when another
                # field satisfied the check, which would break "+".
                logger.info('%s  ----------  %s', name, name_en)
            else:
                raise TypeCheckError("All Available Key is Null")

        sql_result = result.__dict__
        # Build a real list (also on Python 3, where keys() is a view) without
        # the ORM bookkeeping attribute.
        sql_key = [
            column for column in sql_result if column != '_sa_instance_state'
        ]

        try:
            db_session = DBSession()
            try:
                db_session.execute(
                    text(
                        text_2_sql(sql_key).format(
                            table_name=self.task.task_name)), [sql_result])
                db_session.commit()
            finally:
                # Release the connection even when execute/commit fails.
                db_session.close()
        except Exception as e:
            self.logger.exception(msg="[mysql exec err]", exc_info=e)
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)

        self.task.error_code = 0
        return self.task.error_code
Ejemplo n.º 5
0
def parse_hotel(content, url, other_info, source, part, retry_count):
    """Parse a hotel page with the parser registered for ``source`` and validate it.

    Args:
        content: Page source, passed straight to the parser.
        url: Page URL, passed to the parser and quoted in error messages.
        other_info: Extra task data, passed straight to the parser.
        source: Key selecting the parser; also written to ``result.source``.
        part: Stored in ``result.continent``.
        retry_count: The geocoding fallback for missing coordinates is only
            attempted once this exceeds 3.

    Returns:
        The parsed hotel object, with its text fields run through
        ``tradition2simple`` (traditional to simplified Chinese).

    Raises:
        TypeCheckError: unknown ``source``, unusable coordinates, both hotel
            names unusable, or no images for ``booking`` / ``hotels``.
    """
    function_dict = {
        'agoda': agoda_parser.agoda_parser,
        'booking': booking_parser.booking_parser,
        'ctrip': ctrip_parser.ctrip_parser,
        'elong': elong_parser.elong_parser,
        'expedia': expedia_parser.expedia_parser,
        'hotels': hotels_parser.hotels_parser,
        'hoteltravel': hoteltravel_parser.hoteltravel_parser,
        'hrs': hrs_parser.hrs_parser,
        'cheaptickets': expedia_parser.expedia_parser,
        'orbitz': expedia_parser.expedia_parser,
        'travelocity': expedia_parser.expedia_parser,
        'ebookers': expedia_parser.expedia_parser,
        'tripadvisor': tripadvisor_parser.tripadvisor_parser,
        'ctripcn': ctrip_cn_parser.ctrip_cn_parser,
        'hilton': hilton_parser.hilton_parser,
        'ihg': ihg_parser.ihg_parser,
        'holiday': holiday_parser.holiday_parser,
        'accor': accor_parser.accor_parser,
        'marriott': marriott_parser.marriott_parser,
    }
    if source not in function_dict:
        raise TypeCheckError(
            'Error Parser Source        with source %s   url %s ' %
            (source, url))

    parser = function_dict[source]
    # ``__name__`` exists on both Python 2 and 3 (``func_name`` is 2-only).
    parser_name = getattr(parser, '__name__', repr(parser))
    result = parser(content, url, other_info)

    # Coordinates: fail fast on early attempts; after more than three retries
    # fall back to geocoding the address.
    if not key_is_legal(result.map_info):
        if retry_count > 3:
            if not key_is_legal(result.address):
                raise TypeCheckError(
                    'Error map_info and address NULL        with parser %ss    url %s'
                    % (parser_name, url))
            google_map_info = google_get_map_info(result.address)
            if not key_is_legal(google_map_info):
                raise TypeCheckError(
                    'Error google_map_info  NULL        with parser %ss    url %s'
                    % (parser_name, url))
            result.map_info = google_map_info
        else:
            raise TypeCheckError(
                'Error map_info NULL        with parser %ss    url %s' %
                (parser_name, url))

    if key_is_legal(result.hotel_name) or key_is_legal(result.hotel_name_en):
        # Lazy %-formatting: only one of the two names has to be usable, and
        # "+" would raise if the other one is None.
        logger.info('%s  ----------  %s', result.hotel_name,
                    result.hotel_name_en)
    else:
        raise TypeCheckError(
            'Error hotel_name and hotel_name_en Both NULL        with parser %s    url %s'
            % (parser_name, url))

    # These two sources are additionally required to provide images.
    if result.source == 'booking':
        if not key_is_legal(result.img_items):
            raise TypeCheckError(
                'booking has no img        with parser %s    url %s' %
                (parser_name, url))

    if result.source == 'hotels':
        if not key_is_legal(result.img_items):
            raise TypeCheckError(
                'hotels has no img        with parser %s    url %s' %
                (parser_name, url))

    result.continent = part

    # Several sources share the expedia parser, so record the requested
    # source rather than whatever the parser filled in.
    result.source = source

    # Normalise a missing grade.
    if result.grade == 'NULL':
        result.grade = -1

    # Convert the hotel's text fields from traditional to simplified Chinese.
    keys = [
        'hotel_name', 'hotel_name_en', 'brand_name', 'address', 'service',
        'description', 'accepted_cards', 'check_in_time', 'check_out_time'
    ]

    for key in keys:
        if not getattr(result, key):
            setattr(result, key, 'NULL')
        try:
            value = getattr(result, key)
            # Only byte strings need decoding. The parsers encode as UTF-8,
            # so decode with that rather than the interpreter default.
            if isinstance(value, bytes):
                value = value.decode('utf-8')
            setattr(result, key, tradition2simple(value))
        except Exception:
            # Best effort: keep the unconverted value and record the failure.
            logger.exception('**** %s %s', key, getattr(result, key))
    return result
Ejemplo n.º 6
0
    def _execute(self, **kwargs):
        """Fetch one POI page, validate the parsed mapping and store it.

        Reads ``target_url``, ``city_id`` and ``poi_type`` from
        ``self.task.kwargs``. The ``.com.hk`` host suffix is rewritten to
        ``.cn`` before the page is fetched and handed to the parser selected
        from ``parser_type``. Unusable coordinates are replaced by the result
        of ``google_get_map_info(address)``. The mapping is then written to
        the table named after the task.

        Returns:
            ``0`` (also stored in ``self.task.error_code``) on success.

        Raises:
            ServiceStandardError: the parser returned ``'Error'`` or the
                database write failed.
            TypeCheckError: name, English name, coordinates and introduction
                are all unusable.
        """
        target_url = self.task.kwargs['target_url']
        city_id = self.task.kwargs['city_id']
        poi_type = self.task.kwargs['poi_type']

        target_url = target_url.replace('.com.hk', '.cn')
        with MySession(need_cache=True) as session:
            page = session.get(target_url, timeout=120)
            page.encoding = 'utf8'

            parser = parser_type[poi_type]
            result = parser(page.content, target_url, city_id=city_id)

            if result == 'Error':
                raise ServiceStandardError(ServiceStandardError.PARSE_ERROR)

            result['city_id'] = city_id
            # A real list of column names (keys() is a view on Python 3).
            sql_key = list(result.keys())

            name = result['name']
            name_en = result['name_en']
            map_info = result['map_info']
            address = result['address']

            # Coordinates must parse as "lon,lat" and must not be (0, 0).
            map_info_is_legal = True
            try:
                lon, lat = map_info.split(',')
                if float(lon) == 0.0 and float(lat) == 0.0:
                    map_info_is_legal = False
            except Exception as e:
                map_info_is_legal = False
                logger.exception(msg="[map info is not legal]", exc_info=e)

            if not key_is_legal(map_info) or not map_info_is_legal:
                # TODO: the checks that rejected an unusable address or an
                # unusable geocoding result are disabled, so the geocoder
                # output is stored as-is.
                result['map_info'] = google_get_map_info(address)

            # ``result`` is used as a mapping throughout, so the introduction
            # is looked up by key; attribute access would raise
            # AttributeError exactly when this last check is needed.
            if key_is_legal(name) or key_is_legal(
                    name_en) or map_info_is_legal or key_is_legal(
                        result.get('introduction')):
                # Lazy %-formatting: either name may be None when another
                # field satisfied the check, which would break "+".
                logger.info('%s  ----------  %s', name, name_en)
            else:
                raise TypeCheckError(
                    'Error All Keys is None with parser %s  url %s' %
                    (getattr(parser, '__name__', repr(parser)), target_url))

            try:
                db_session = DBSession()
                try:
                    db_session.execute(
                        text(
                            text_2_sql(sql_key).format(
                                table_name=self.task.task_name)), [result])
                    db_session.commit()
                finally:
                    # Release the connection even when execute/commit fails.
                    db_session.close()
            except Exception as e:
                logger.exception(e)
                raise ServiceStandardError(
                    error_code=ServiceStandardError.MYSQL_ERROR,
                    wrapped_exception=e)

            self.task.error_code = 0
            return self.task.error_code