Beispiel #1
0
def hotel_base_data(self, source, url, other_info, **kwargs):
    """Download a hotel page through a SOCKS5 proxy, parse it and return the result.

    Args:
        self: Task object providing ``retry()`` (presumably a bound Celery
            task -- confirm against the decorator, which is not visible here).
        source: Source name forwarded to ``parse_hotel``.
        url: Hotel detail page URL.
        other_info: Dict that must contain ``'source_id'``; the text of the
            Agoda "about hotel" page is stored into it under
            ``'about_content'`` before parsing.
        **kwargs: Must contain ``'task_id'``, passed to ``update_task`` on
            success.

    Returns:
        The truthy value returned by ``parse_hotel``. If ``self.retry()``
        returns instead of raising, the (falsy) parse result or ``None`` is
        returned.

    The proxy is reported via ``update_proxy`` with code ``'0'`` on success
    and ``'23'`` on any failure or empty parse result.
    """
    started_at = time.time()
    proxy = get_proxy(source="Platform")
    proxies = {'http': 'socks5://' + proxy, 'https': 'socks5://' + proxy}
    headers = {'User-agent': GetUserAgent()}

    try:
        page = requests.get(url, proxies=proxies, headers=headers, timeout=240)
        page.encoding = 'utf8'
        content = page.text

        # Agoda special case: the "about hotel" fragment lives on a separate
        # endpoint. It is requested for every source, not only Agoda.
        # NOTE(review): this request does not go through the proxy -- confirm
        # that is intentional.
        url_about = 'https://www.agoda.com/NewSite/zh-cn/Hotel/AboutHotel?hotelId={0}&languageId=8&hasBcomChildPolicy=False'.format(
            other_info['source_id'])
        # A timeout is required: without one requests may block forever.
        page_about = requests.get(url=url_about, headers=headers, timeout=240)
        page_about.encoding = 'utf8'
        other_info['about_content'] = page_about.text

        result = parse_hotel(content=content,
                             url=url,
                             other_info=other_info,
                             source=source)
        if result:
            update_task(kwargs['task_id'])
            print("Success with " + proxy + ' CODE 0')
            update_proxy('Platform', proxy, started_at, '0')
    except Exception:
        update_proxy('Platform', proxy, started_at, '23')
        # format_exc() takes no exception argument; it formats the exception
        # currently being handled.
        self.retry(exc=traceback.format_exc())
        return None

    if not result:
        # Kept outside the try block: retry() signals by raising, and that
        # signal must not be swallowed by the broad handler above (which
        # would report the proxy failure twice and retry a second time).
        update_proxy('Platform', proxy, started_at, '23')
        self.retry()
    return result
Beispiel #2
0
def hotel_static_base_data(self, parent_task_id, task_name, source, source_id,
                           city_id, hotel_url, **kwargs):
    """Re-parse a previously stored hotel page and merge the result into the DB.

    Args:
        self: Task object; ``task_source``, ``task_type`` and ``error_code``
            are set on it.
        parent_task_id: Id of the task whose stored page is loaded; also used
            in log messages.
        task_name: Name used to look up the stored page, and forwarded to
            ``parse_hotel`` as ``part``.
        source: Source name; its title-cased form becomes ``task_source``.
        source_id: Stored in ``other_info`` for the parser.
        city_id: Stored in ``other_info`` for the parser.
        hotel_url: URL handed to the parser.
        **kwargs: Unused.

    Returns:
        The object returned by ``parse_hotel``.

    Raises:
        Exception: If parsing yields an empty result, or (re-raised, with
            ``error_code`` set to 33) if the database merge/commit fails.
    """
    logger.info("parent task id: {0}, start task".format(parent_task_id))
    self.task_source = source.title()
    self.task_type = 'HotelStaticDataParse'
    # Load the saved page information.
    other_info = {'source_id': source_id, 'city_id': city_id}
    logger.info(
        'http://10.10.180.145:8888/hotel_page_viewer?task_name=hotel_base_data_tripadvisor_total_new&id='
        + parent_task_id)

    content = get_page_content(task_id=parent_task_id, task_name=task_name)
    logger.info(
        "parent task id: {0}, end of get hotel content, start parse hotel".
        format(parent_task_id))
    result = parse_hotel(content=content,
                         url=hotel_url,
                         other_info=other_info,
                         source=source,
                         part=task_name)
    logger.info(
        "parent task id: {0}, end of parse hotel, start insert db".format(
            parent_task_id))

    if not result:
        # Nothing has touched the database yet, so report the real cause.
        raise Exception('parse hotel error: empty result')

    try:
        session = DBSession()
        try:
            session.merge(result)
            session.commit()
        finally:
            # Always release the session, including when merge/commit fails.
            session.close()
    except Exception as e:
        self.error_code = 33
        logger.exception(e)
        # Bare raise keeps the original traceback.
        raise
    logger.info("parent task id: {0}, end of insert db".format(parent_task_id))
    self.error_code = 0
    return result
Beispiel #3
0
def hotel_routine_base_data(self, source, url, other_info, **kwargs):
    """Crawl a hotel page, parse it, store the result and archive the page.

    Each stage records a distinct ``self.error_code`` before re-raising:
    12 (URL normalisation), 22 (page download), 102 (``TypeCheckError`` from
    the parser), 27 (any other parser error), 33 (database merge/commit) and
    104 (archiving the page content). The code stays 0 when every stage
    succeeds.

    Args:
        self: Task object; ``task_source``, ``task_type`` and ``error_code``
            are set on it.
        source: Source name; ``'hotels'`` triggers URL normalisation.
        url: Hotel detail page URL.
        other_info: Dict forwarded to the parser; must contain
            ``'source_id'`` and ``'city_id'`` for the archiving step.
        **kwargs: Must contain ``'mongo_task_id'``.

    Returns:
        None.

    Raises:
        Exception: Whatever the failing stage raised, re-raised unchanged.
    """
    self.task_source = source.title()
    self.task_type = 'Hotel'
    self.error_code = 0

    # Task initialisation: rebuild hotels.com URLs on the zh.hotels.com host
    # from the numeric hotel id found in the incoming URL.
    try:
        if source == 'hotels':
            hotel_id = re.findall(r'hotel-id=(\d+)', url)[0]
            url = 'http://zh.hotels.com/hotel/details.html?hotel-id=' + hotel_id
    except Exception as e:
        self.error_code = 12
        logger.exception(e)
        raise

    # Download the page.
    try:
        http_session = MySession()
        page = http_session.get(url, timeout=240)
        page.encoding = 'utf8'
        content = page.text
    except Exception as e:
        self.error_code = 22
        logger.exception(e)
        raise

    # Parse the page.
    try:
        result = parse_hotel(content=content,
                             url=url,
                             other_info=other_info,
                             source=source,
                             part="NULL")
    except TypeCheckError as e:
        self.error_code = 102
        logger.exception(e)
        raise
    except Exception as e:
        self.error_code = 27
        logger.exception(e)
        raise

    # Persist the parsed result.
    try:
        db_session = DBSession()
        try:
            db_session.merge(result)
            db_session.commit()
        finally:
            # Always release the session, including when merge/commit fails.
            db_session.close()
    except Exception as e:
        self.error_code = 33
        logger.exception(e)
        raise

    # Save the page content after a successful crawl.
    try:
        save_task_and_page_content(
            task_name='hotelinfo_routine_{0}'.format(source),
            content=content,
            task_id=kwargs['mongo_task_id'],
            source=source,
            source_id=other_info['source_id'],
            city_id=other_info['city_id'],
            url=url)
    except Exception as e:
        self.error_code = 104
        logger.exception(e)
        raise
Beispiel #4
0
    def _execute(self, **kwargs):
        """Crawl one hotel, parse it, and store the result in MongoDB and MySQL.

        Reads ``url``, ``source``, ``source_id``, ``city_id``, ``country_id``
        and ``hid`` from ``self.task.kwargs``. For the sources handled by
        ``hotel_detail_database`` the data comes from that helper and is
        parsed with ``parse_hotel_info``; every other source is downloaded
        (some sources need several pages, passed to the parser as a list or
        tuple) and parsed with ``parse_hotel``.

        Args:
            **kwargs: Unused; task parameters come from ``self.task.kwargs``.

        Returns:
            ``self.task.error_code``, which is set to 0 on success.

        Raises:
            ServiceStandardError: With the helper's error code when
                ``hotel_detail_database`` fails, with ``MONGO_ERROR`` when
                the MongoDB step fails (duplicate keys are only logged), and
                with ``MYSQL_ERROR`` when the MySQL step fails.
            KeyError: If a required task kwarg is missing.
        """
        url = self.task.kwargs['url']
        source = self.task.kwargs['source']
        source_id = self.task.kwargs['source_id']
        city_id = self.task.kwargs['city_id']
        # Not used below; the lookup is kept so that a task without
        # 'country_id' still fails early with KeyError.
        country_id = self.task.kwargs['country_id']
        hid = self.task.kwargs['hid']

        headers = {}
        other_info = {'source_id': source_id, 'city_id': city_id}

        if source in ['starwood', 'hyatt', 'gha', 'shangrila', 'fourseasons']:
            error_code, res, page_store_key_list = hotel_detail_database(
                url, source)

            if error_code == 0:
                result = parse_hotel_info(res)
            else:
                raise ServiceStandardError(error_code=error_code)
        else:
            with MySession(need_cache=True) as session:

                # booking start
                if source == 'booking':
                    headers['Referer'] = 'http://www.booking.com'

                # booking end

                session.headers.update(headers)
                start = time.time()
                if source not in ('hilton', 'ihg', 'holiday', 'accor',
                                  'marriott'):
                    page = session.get(url, timeout=240)
                    page.encoding = 'utf8'
                    content = page.text
                elif source == 'ihg':
                    # The task URL carries two URLs joined by '#####'.
                    url1, url2 = url.split('#####')
                    page1 = session.get(url1, timeout=240)
                    page1.encoding = 'utf8'
                    content1 = page1.text

                    page2 = session.get(url2, timeout=240)
                    page2.encoding = 'utf8'
                    content2 = page2.text

                    content = [content1, content2]
                elif source == 'holiday':
                    # Note the order: the second URL comes first here.
                    url2, url1 = url.split('#####')
                    # NOTE(review): hard-coded API key; consider moving it to
                    # configuration.
                    page1 = requests.get(
                        url1,
                        headers={
                            'x-ihg-api-key':
                            'se9ym5iAzaW8pxfBjkmgbuGjJcr3Pj6Y',
                            'ihg-language': 'zh-CN'
                        },
                        timeout=240)
                    page1.encoding = 'utf8'
                    content1 = page1.text

                    page2 = requests.get(
                        url2,
                        timeout=240,
                        headers={
                            'accept': 'application/json, text/plain, */*',
                            'Content-Type': 'application/json; charset=UTF-8',
                            'user-agent':
                            'Mozilla/5.0 (Macintosh; Intel Mac OS X 10_10_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36',
                            'ihg-language': 'zh-CN',
                        })
                    page2.encoding = 'utf8'
                    content2 = page2.text

                    # Same URL as page1, requested without the
                    # 'ihg-language' header.
                    page3 = requests.get(url1,
                                         headers={
                                             'x-ihg-api-key':
                                             'se9ym5iAzaW8pxfBjkmgbuGjJcr3Pj6Y'
                                         },
                                         timeout=240)
                    page3.encoding = 'utf8'
                    content3 = page3.text

                    content = (content1, content2, content3)
                elif source == 'accor':
                    # NOTE(review): hard-coded proxy-service credentials;
                    # consider moving them to configuration.
                    proxy_url = "http://10.10.239.46:8087/proxy?source=pricelineFlight&user=crawler&passwd=spidermiaoji2014"
                    # Timeouts added: without one requests may block forever.
                    r = requests.get(proxy_url, timeout=240)
                    proxies = {'https': "socks5://" + str(r.text)}
                    headers = {
                        "User-Agent":
                        "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_11_6) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"
                    }
                    page = requests.get(url,
                                        headers=headers,
                                        verify=False,
                                        proxies=proxies,
                                        timeout=240)
                    page.encoding = 'utf8'
                    content = page.text
                elif source == 'marriott':
                    # The task URL is '#####'-separated: the page URL first,
                    # then 'key=value' segments and/or a bare hotel name.
                    url_list = url.split('#####')
                    url = url_list[0]

                    for position, segment in enumerate(url_list):
                        parts = segment.split('=')
                        if len(parts) > 1:
                            key, value = parts[0], parts[1]
                            if key == 'longtitude':
                                other_info['longtitude'] = value
                            if key == 'latitude':
                                other_info['latitude'] = value
                        elif position == 1:
                            # A segment without '=' in second position is
                            # taken as the English hotel name.
                            other_info['hotel_name_en'] = segment

                    url2 = url.replace("travel", "hotel-photos")
                    url3 = url.replace("travel/", "maps/travel/")
                    url4 = url.replace("hotels/", "hotels/fact-sheet/")
                    headers = {
                        'User-Agent':
                        'Mozilla/5.0 (Macintosh; Intel Mac OS X 10.11; rv:47.0) Gecko/20100101 Firefox/47.0'
                    }
                    if "https://www.marriott.com" in url:
                        page1 = requests.get(url, headers=headers, timeout=240)
                        page2 = requests.get(url2,
                                             headers=headers,
                                             timeout=240)
                        page3 = requests.get(url3,
                                             headers=headers,
                                             timeout=240)
                        page4 = requests.get(url4,
                                             headers=headers,
                                             timeout=240)

                        page1.encoding = 'utf8'
                        page2.encoding = 'utf8'
                        page3.encoding = 'utf8'
                        page4.encoding = 'utf8'

                        content1 = page1.text
                        content2 = page2.text
                        content3 = page3.text
                        content4 = page4.text
                        content = (content1, content2, content3, content4)
                    else:
                        url2 = url + "/hotel-overview"
                        page1 = requests.get(url, headers=headers, timeout=240)
                        page2 = requests.get(url2,
                                             headers=headers,
                                             timeout=240)
                        page1.encoding = 'utf8'
                        page2.encoding = 'utf8'
                        content1 = page1.text
                        content2 = page2.text
                        content = (content1, content2)
                else:
                    # Remaining case: hilton.
                    session.auto_update_host = False
                    hilton_index = url.find('index.html')
                    if hilton_index > -1:
                        url = url[:hilton_index]
                    split_args = url.split('/')
                    detail_url = 'http://www3.hilton.com/zh_CN/hotels/{0}/{1}/popup/hotelDetails.html'.format(
                        split_args[-3], split_args[-2])
                    map_info_url = url + 'maps-directions.html'
                    desc_url = url + 'about.html'

                    # Same explicit timeout as the other session.get calls.
                    page = session.get(url, timeout=240)
                    map_info_page = session.get(map_info_url, timeout=240)
                    desc_page = session.get(desc_url, timeout=240)

                    detail_page = session.get(detail_url, timeout=240)
                    page.encoding = 'utf8'
                    detail_page.encoding = 'utf8'
                    map_info_page.encoding = 'utf8'
                    desc_page.encoding = 'utf8'
                    __content = page.text
                    logger.info(detail_url)
                    __detail_content = detail_page.text
                    __map_info_content = map_info_page.text
                    __desc_content = desc_page.text

                    content = [
                        __content, __detail_content, __map_info_content,
                        __desc_content
                    ]
                logger.debug("[crawl_data][Takes: {}]".format(time.time() -
                                                              start))

                start = time.time()
                result = parse_hotel(content=content,
                                     url=url,
                                     other_info=other_info,
                                     source=source,
                                     part=self.task.task_name,
                                     retry_count=self.task.used_times)
                logger.debug("[parse_hotel][func: {}][Takes: {}]".format(
                    parse_hotel.__name__,
                    time.time() - start))

        # Store a copy of the result, with a GeoJSON point, in MongoDB.
        try:
            data_collections = mongo_data_client['ServicePlatform'][
                self.task.task_name]
            data_collections.create_index([('source', 1), ('source_id', 1)],
                                          unique=True,
                                          background=True)
            data_collections.create_index([('location', '2dsphere')],
                                          background=True)
            tmp_result = deepcopy(result.values(backdict=True))
            # map_info is read as "<lon>,<lat>"; anything else raises here
            # and is reported as MONGO_ERROR below.
            lon, lat = str(result.map_info).split(',')
            lon, lat = float(lon), float(lat)
            tmp_result.update(
                {'location': {
                    'type': "Point",
                    'coordinates': [lon, lat]
                }})
            data_collections.save(tmp_result)
        except pymongo.errors.DuplicateKeyError:
            logger.warning("[result already in db]")
        except Exception as exc:
            raise ServiceStandardError(
                error_code=ServiceStandardError.MONGO_ERROR,
                wrapped_exception=exc)

        # Insert the result into MySQL, in the table named after the task.
        start = time.time()
        try:
            service_platform_conn = service_platform_pool.connection()
            try:
                cursor = service_platform_conn.cursor()
                try:
                    others_info = json.loads(result.others_info)
                    others_info['hid'] = hid
                    result.others_info = json.dumps(others_info)
                    sql = result.generation_sql()
                    sql = sql.format(table_name=self.task.task_name)
                    values = result.values()
                    self.logger.info(result.__dict__)
                    cursor.execute(sql, values)
                    service_platform_conn.commit()
                finally:
                    # Release the cursor even when execute/commit fails.
                    cursor.close()
            finally:
                # Always hand the connection back.
                service_platform_conn.close()
        except Exception as e:
            logger.exception(e)
            raise ServiceStandardError(
                error_code=ServiceStandardError.MYSQL_ERROR,
                wrapped_exception=e)

        logger.debug("[Insert DB][Takes: {}]".format(time.time() - start))
        self.task.error_code = 0
        return self.task.error_code