def _execute(self, **kwargs):
        """Geocode the address attached to a task and store the coordinates.

        Reads ``table_name``, ``source``, ``sid`` and ``other_info`` from
        ``self.task.kwargs``. ``other_info`` is parsed as JSON and its
        ``address`` entry is passed to ``google_get_map_info``. On success the
        ``update_map_info`` statement is executed and the row is marked with
        status 1; when the address or the geocoding result is empty the row is
        marked with status 2 and an error is raised.

        Returns:
            tuple: ``(source, sid)`` of the processed row.

        Raises:
            ServiceStandardError: with ``KEY_WORDS_FILTER`` when the address is
                missing/empty or when no map info could be obtained.
        """
        # NOTE(review): the SQL statements below are assembled with string
        # formatting from task arguments -- confirm these values are trusted.
        table_name = self.task.kwargs['table_name']
        source = self.task.kwargs['source']
        sid = self.task.kwargs['sid']
        other_info = self.task.kwargs['other_info']

        # Validate before encoding: calling ``.encode`` on a missing (None)
        # address would raise AttributeError and skip the status update below.
        raw_address = json.loads(other_info).get('address')
        if not raw_address:
            execute_sql(update_status % (2, source, sid, table_name))
            raise ServiceStandardError(
                error_code=ServiceStandardError.KEY_WORDS_FILTER,
                wrapped_exception=Exception(u'address 为空'))
        address = raw_address.encode('utf8')

        map_info = google_get_map_info(address)
        if not map_info:
            execute_sql(update_status % (2, source, sid, table_name))
            raise ServiceStandardError(
                error_code=ServiceStandardError.KEY_WORDS_FILTER,
                wrapped_exception=Exception(u'mapinfo 为空'))

        sql = update_map_info % (table_name, map_info, source, sid)
        # The second "_"-separated part of the table name selects the id
        # column: "hotel" tables use ``source_id``, all others use ``id``.
        typ2 = table_name.split('_')[1]
        sql = sql.format(field='source_id' if typ2 == 'hotel' else 'id')

        execute_sql(sql)
        execute_sql(update_status % (1, source, sid, table_name))

        self.task.error_code = 0
        return source, sid
def supplement_map_info(self, table_name, source, sid, other_info, **kwargs):
    """Geocode a row's address and store the coordinates.

    ``other_info`` is parsed as JSON and its ``address`` entry is passed to
    ``google_get_map_info``. On success the ``update_map_info`` statement is
    executed and the row is marked with status 1; when the address or the
    geocoding result is empty the row is marked with status 2 and an
    exception is raised.

    Args:
        table_name: Table to update; the second "_"-separated part decides
            which id column is used ("hotel" -> ``source_id``, else ``id``).
        source: Source name of the row; its title-cased form is stored on
            the task response.
        sid: Identifier of the row within ``source``.
        other_info: JSON string expected to contain an ``address`` entry.
        **kwargs: Must contain ``task_response``, which receives ``source``,
            ``type`` and ``error_code``.

    Returns:
        tuple: ``(source, sid)`` of the processed row.

    Raises:
        Exception: when the address is missing/empty or no map info could be
            obtained.
    """
    # NOTE(review): the SQL statements below are assembled with string
    # formatting from the arguments -- confirm these values are trusted.
    task_response = kwargs['task_response']
    task_response.source = source.title()
    task_response.type = 'SupplementField'

    # Validate before encoding: calling ``.encode`` on a missing (None)
    # address would raise AttributeError and skip the status update below.
    raw_address = json.loads(other_info).get('address')
    if not raw_address:
        execute_sql(update_status % (2, source, sid, table_name))
        raise Exception(u'address 为空')
    address = raw_address.encode('utf8')

    map_info = google_get_map_info(address)
    if not map_info:
        execute_sql(update_status % (2, source, sid, table_name))
        raise Exception(u'mapinfo 为空')

    sql = update_map_info % (table_name, map_info, source, sid)
    typ2 = table_name.split('_')[1]
    sql = sql.format(field='source_id' if typ2 == 'hotel' else 'id')

    execute_sql(sql)
    execute_sql(update_status % (1, source, sid, table_name))

    task_response.error_code = 0
    return source, sid
Exemple #3
0
    def _execute(self, **kwargs):
        """Download a place page, parse it and insert the result into MySQL.

        Reads ``city_id`` and ``target_url`` from ``self.task.kwargs``, fetches
        the page (sent with ``Host: place.qyer.com``), parses it with
        ``page_parser`` and, when the parsed coordinates are unusable, replaces
        them with the result of ``google_get_map_info`` for the parsed address.
        The parsed object's attributes are then inserted into the table named
        by ``self.task.task_name``.

        Returns:
            int: ``self.task.error_code``, set to 0 on success.

        Raises:
            Exception: when the page asks for a verification code.
            TypeCheckError: when name, English name, coordinates and
                introduction are all unusable.
            ServiceStandardError: with ``MYSQL_ERROR`` when the insert fails.
        """
        with MySession(need_cache=True, need_proxies=True) as session:
            city_id = self.task.kwargs['city_id']
            target_url = self.task.kwargs['target_url']
            headers = {'Host': 'place.qyer.com'}
            page = session.get(target_url, headers=headers, timeout=240)
            page.encoding = 'utf8'
            content = page.text

            # The page text asks the visitor to enter a verification code.
            if '请输入验证码' in content:
                raise Exception("请输入验证码")

            result = page_parser(content=content, target_url=target_url)
            result.city_id = city_id
            name = result.name
            name_en = result.name_en
            map_info = result.map_info
            address = result.address

            # Coordinates must look like "lon,lat" and must not be "0,0".
            map_info_is_legal = True
            try:
                lon, lat = map_info.split(',')
                if float(lon) == 0.0 and float(lat) == 0.0:
                    map_info_is_legal = False
            except Exception as e:
                map_info_is_legal = False
                logger.exception(msg="[map info is not legal]", exc_info=e)

            if not key_is_legal(map_info) or not map_info_is_legal:
                if not key_is_legal(address):
                    # TODO: check temporarily disabled
                    pass
                    # raise TypeCheckError(
                    #     'Error map_info and address NULL        with parser %ss    url %s' % (
                    #         page_parser.func_name, target_url))
                google_map_info = google_get_map_info(address)
                if not key_is_legal(google_map_info):
                    # TODO: check temporarily disabled
                    pass
                    # raise TypeCheckError(
                    #     'Error google_map_info  NULL  with [parser: {}][url: {}][address: {}][map_info: {}]'.format(
                    #         page_parser.func_name, target_url, address, map_info)
                    # )
                result.map_info = google_map_info

            # NOTE: ``map_info_is_legal`` still describes the parsed
            # coordinates, not the Google replacement assigned above.
            has_usable_key = (
                key_is_legal(name)
                or key_is_legal(name_en)
                or map_info_is_legal
                or key_is_legal(result.introduction)
            )
            if not has_usable_key:
                raise TypeCheckError("All Available Key is Null")
            # Lazy formatting: only one of the fields has to be usable, so
            # concatenating them could fail when the other one is None.
            logger.info('%s  ----------  %s', name, name_en)

        sql_result = result.__dict__
        # Skip SQLAlchemy's bookkeeping attribute. Building a new list also
        # works where ``dict.keys()`` returns a view without ``remove``.
        sql_key = [key for key in sql_result if key != '_sa_instance_state']

        try:
            db_session = DBSession()
            try:
                db_session.execute(
                    text(
                        text_2_sql(sql_key).format(
                            table_name=self.task.task_name)), [sql_result])
                db_session.commit()
            finally:
                # Release the connection even when execute/commit fails.
                db_session.close()
        except Exception as e:
            self.logger.exception(msg="[mysql exec err]", exc_info=e)
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)

        self.task.error_code = 0
        return self.task.error_code
def parse_hotel(content, url, other_info, source, part, retry_count):
    """Parse a hotel page with the parser registered for ``source``.

    After parsing, the result is validated (coordinates, names and, for
    ``booking`` / ``hotels`` results, images), tagged with ``part`` and
    ``source``, and its text fields are passed through ``tradition2simple``.

    Args:
        content: Page content handed to the parser.
        url: URL of the page; passed to the parser and used in error messages.
        other_info: Extra information handed to the parser.
        source: Key selecting the parser; also stored as ``result.source``.
        part: Value stored as ``result.continent``.
        retry_count: Once this exceeds 3, missing coordinates are looked up
            through ``google_get_map_info`` using the parsed address instead
            of failing immediately.

    Returns:
        The parser's result object, validated and normalised in place.

    Raises:
        TypeCheckError: for an unknown ``source``, missing coordinates (and no
            usable fallback), missing hotel names, or missing images for
            ``booking`` / ``hotels`` results.
    """
    function_dict = {
        'agoda': agoda_parser.agoda_parser,
        'booking': booking_parser.booking_parser,
        'ctrip': ctrip_parser.ctrip_parser,
        'elong': elong_parser.elong_parser,
        'expedia': expedia_parser.expedia_parser,
        'hotels': hotels_parser.hotels_parser,
        'hoteltravel': hoteltravel_parser.hoteltravel_parser,
        'hrs': hrs_parser.hrs_parser,
        'cheaptickets': expedia_parser.expedia_parser,
        'orbitz': expedia_parser.expedia_parser,
        'travelocity': expedia_parser.expedia_parser,
        'ebookers': expedia_parser.expedia_parser,
        'tripadvisor': tripadvisor_parser.tripadvisor_parser,
        'ctripcn': ctrip_cn_parser.ctrip_cn_parser,
        'hilton': hilton_parser.hilton_parser,
        'ihg': ihg_parser.ihg_parser,
        'holiday': holiday_parser.holiday_parser,
        'accor': accor_parser.accor_parser,
        'marriott': marriott_parser.marriott_parser,
    }
    if source not in function_dict:
        raise TypeCheckError(
            'Error Parser Source        with source %s   url %s ' %
            (source, url))

    parser = function_dict[source]
    # ``__name__`` is available on functions in both Python 2 and 3, unlike
    # the Python-2-only ``func_name``.
    parser_name = getattr(parser, '__name__', repr(parser))
    result = parser(content, url, other_info)

    # Coordinates: fail on early attempts; after more than 3 retries fall
    # back to geocoding the parsed address.
    if not key_is_legal(result.map_info):
        if retry_count > 3:
            if not key_is_legal(result.address):
                raise TypeCheckError(
                    'Error map_info and address NULL        with parser %ss    url %s'
                    % (parser_name, url))
            google_map_info = google_get_map_info(result.address)
            if not key_is_legal(google_map_info):
                raise TypeCheckError(
                    'Error google_map_info  NULL        with parser %ss    url %s'
                    % (parser_name, url))
            result.map_info = google_map_info
        else:
            raise TypeCheckError(
                'Error map_info NULL        with parser %ss    url %s' %
                (parser_name, url))

    if not (key_is_legal(result.hotel_name)
            or key_is_legal(result.hotel_name_en)):
        raise TypeCheckError(
            'Error hotel_name and hotel_name_en Both NULL        with parser %s    url %s'
            % (parser_name, url))
    # Lazy formatting: only one of the names has to be usable, so
    # concatenating them could fail when the other one is None.
    logger.info('%s  ----------  %s', result.hotel_name, result.hotel_name_en)

    if result.source == 'booking':
        if not key_is_legal(result.img_items):
            raise TypeCheckError(
                'booking has no img        with parser %s    url %s' %
                (parser_name, url))

    if result.source == 'hotels':
        if not key_is_legal(result.img_items):
            raise TypeCheckError(
                'hotels has no img        with parser %s    url %s' %
                (parser_name, url))

    result.continent = part

    # Several sources share the expedia parser, so record the requested
    # source on the result.
    result.source = source

    # Normalise a missing grade.
    if result.grade == 'NULL':
        result.grade = -1

    # Convert the hotel's text fields from Traditional to Simplified Chinese.
    keys = [
        'hotel_name', 'hotel_name_en', 'brand_name', 'address', 'service',
        'description', 'accepted_cards', 'check_in_time', 'check_out_time'
    ]
    text_type = type(u'')

    for key in keys:
        if not getattr(result, key):
            setattr(result, key, 'NULL')
        try:
            value = getattr(result, key)
            # Already-decoded text is used as is; calling ``.decode()`` on it
            # fails for non-ASCII text. Anything else is decoded as before.
            if not isinstance(value, text_type):
                value = value.decode()
            setattr(result, key, tradition2simple(value))
        except Exception:
            # Best effort: keep the unconverted value and record the failure
            # together with its traceback.
            logger.exception('[tradition2simple failed][key: %s][value: %r]',
                             key, getattr(result, key))
    return result
Exemple #5
0
    def _execute(self, **kwargs):
        """Download a POI page, parse it and insert the result into MySQL.

        Reads ``target_url``, ``city_id`` and ``poi_type`` from
        ``self.task.kwargs``. ``.com.hk`` in the URL is replaced by ``.cn``
        before the page is fetched. The parser is selected through
        ``parser_type[poi_type]`` and returns a dict-like result (or the
        string ``'Error'``). When the parsed coordinates are unusable they are
        replaced by the result of ``google_get_map_info`` for the parsed
        address. The result is inserted into the table named by
        ``self.task.task_name``.

        Returns:
            int: ``self.task.error_code``, set to 0 on success.

        Raises:
            ServiceStandardError: ``PARSE_ERROR`` when the parser returns
                ``'Error'``; ``MYSQL_ERROR`` when the insert fails.
            TypeCheckError: when name, English name, coordinates and
                introduction are all unusable.
        """
        target_url = self.task.kwargs['target_url']
        city_id = self.task.kwargs['city_id']
        poi_type = self.task.kwargs['poi_type']

        target_url = target_url.replace('.com.hk', '.cn')
        with MySession(need_cache=True) as session:
            page = session.get(target_url, timeout=120)
            page.encoding = 'utf8'

            parser = parser_type[poi_type]
            result = parser(page.content, target_url, city_id=city_id)

            if result == 'Error':
                raise ServiceStandardError(ServiceStandardError.PARSE_ERROR)

            result['city_id'] = city_id
            sql_key = result.keys()

            name = result['name']
            name_en = result['name_en']
            map_info = result['map_info']
            address = result['address']

            # Coordinates must look like "lon,lat" and must not be "0,0".
            map_info_is_legal = True
            try:
                lon, lat = map_info.split(',')
                if float(lon) == 0.0 and float(lat) == 0.0:
                    map_info_is_legal = False
            except Exception as e:
                map_info_is_legal = False
                logger.exception(msg="[map info is not legal]", exc_info=e)

            if not key_is_legal(map_info) or not map_info_is_legal:
                if not key_is_legal(address):
                    # Check currently disabled.
                    pass
                    # raise TypeCheckError(
                    #     'Error map_info and address NULL        with parser %ss    url %s' % (
                    #         parser.func_name, target_url))
                google_map_info = google_get_map_info(address)
                if not key_is_legal(google_map_info):
                    # Check currently disabled.
                    pass
                    # raise TypeCheckError(
                    #     'Error google_map_info  NULL  with [parser: {}][url: {}][address: {}][map_info: {}]'.format(
                    #         parser.func_name, target_url, address, map_info)
                    # )
                result['map_info'] = google_map_info

            # ``result`` is used as a mapping everywhere else in this method,
            # so the introduction is read by key; attribute access
            # (``result.introduction``) raises AttributeError on a dict.
            has_usable_key = (
                key_is_legal(name)
                or key_is_legal(name_en)
                or map_info_is_legal
                or key_is_legal(result.get('introduction'))
            )
            if not has_usable_key:
                raise TypeCheckError(
                    'Error All Keys is None with parser %s  url %s' %
                    (getattr(parser, '__name__', repr(parser)), target_url))
            # Lazy formatting: only one of the fields has to be usable, so
            # concatenating them could fail when the other one is None.
            logger.info('%s  ----------  %s', name, name_en)

            try:
                # Separate name so the HTTP session above is not shadowed.
                db_session = DBSession()
                try:
                    db_session.execute(
                        text(
                            text_2_sql(sql_key).format(
                                table_name=self.task.task_name)), [result])
                    db_session.commit()
                finally:
                    # Release the connection even when execute/commit fails.
                    db_session.close()
            except Exception as e:
                logger.exception(e)
                raise ServiceStandardError(
                    error_code=ServiceStandardError.MYSQL_ERROR,
                    wrapped_exception=e)

            self.task.error_code = 0
            return self.task.error_code
Exemple #6
0
#!/usr/bin/env python
# -*- coding: utf-8 -*-
# @Time    : 2017/10/10 下午9:42
# @Author  : Hou Rong
# @Site    :
# @File    : test_google_address.py
# @Software: PyCharm
import sys

sys.path.append('/data/lib')
from proj.my_lib.Common.NetworkUtils import google_get_map_info

if __name__ == '__main__':
    # Manual check: print what google_get_map_info returns for a plain
    # address and for the same address with its city/state part repeated.
    # Another address that can be tried by hand:
    #   'Avenida San Martin 249, El Chalten Z9301ABA, Argentina'
    sample_addresses = (
        'US, CARMEL, CA',
        'US, CARMEL, CA, CARMEL, CA',
    )
    for sample_address in sample_addresses:
        print(google_get_map_info(sample_address))
Exemple #7
0
    def _execute(self, **kwargs):
        """Geocode a station and its city and store both on the document.

        Reads ``city_name``, ``id``, ``country_code`` and ``station_name`` from
        ``self.task.kwargs``. Two lookups are made with
        ``google_get_map_info``: one for "country,city,station" and one for
        "country,city". Both results are written with ``$set`` to the document
        whose ``_id`` is ``ObjectId(id)``.

        NOTE(review): earlier revisions carried a commented-out request to a
        Euronet "StationCache" web service plus the headers/URL it needed;
        none of it was executed, so it has been removed here.

        Returns:
            str: ``'OK'``; ``self.task.error_code`` is set to 0.
        """
        city_name = self.task.kwargs['city_name']
        _id = self.task.kwargs['id']
        country_code = self.task.kwargs['country_code']
        station_name = self.task.kwargs['station_name']

        google_map_info = google_get_map_info(
            '{},{},{}'.format(country_code, city_name, station_name))
        google_city_map_info = google_get_map_info(
            '{},{}'.format(country_code, city_name))

        # ``collections`` is a name defined outside this method.
        # NOTE(review): failures of this update are not wrapped in a
        # ServiceStandardError (the wrapping was commented out) -- confirm.
        collections.update(
            {'_id': ObjectId(_id)},
            {'$set': {
                'map_info': google_map_info,
                'city_map_info': google_city_map_info,
            }},
        )

        self.task.error_code = 0
        return 'OK'