Exemple #1
0
def fetch_price_by_hotel(oid, checkin, checkout, days):
    """Fetch a hotel's price from the parity service and push it to InfluxDB.

    Args:
        oid: String form of the hotel's ``_id`` in ``hub.poi_items``.
        checkin: Check-in date, forwarded unchanged to the parity service.
        checkout: Check-out date, forwarded unchanged to the parity service.
        days: Length of stay, forwarded unchanged to ``push_to_influxdb``.

    Returns:
        The result of ``push_to_influxdb``, or ``None`` when the hotel is
        skipped (unknown hotel or city, hotel that has a ``booking_com_id``,
        or no price in the response).

    Raises:
        AssertionError: If the HTTP status or the payload ``status`` is not
            200.
        TypeError: If the price string has no recognised currency symbol or
            no amount matching ``PRE``.
    """
    hub = databases('hub')
    hotel = hub.poi_items.find_one({'_id': ObjectId(oid)})
    if hotel is None:
        logger.warning(f'Hotel({oid}) not found in poi_items.')
        return None

    if hotel.get('booking_com_id'):
        logger.error('Can not be here.')
        return None

    name = hotel.get('name')
    city_id = hotel.get('city')
    city = hub.meta_cities.find_one({'_id': city_id})
    if city is None:
        logger.warning(f'City({city_id}) of hotel({oid}) not found.')
        return None
    city_name = city.get('name')

    resp = requests.get(
        settings.PARITY,
        params={
            'checkin': checkin,
            'checkout': checkout,
            'hotel': name,
            'city': city_name
        }
    )
    # Explicit raises instead of ``assert`` so the checks survive ``python -O``;
    # the exception type callers may see is unchanged.
    if resp.status_code != 200:
        raise AssertionError(resp.text)
    data = resp.json()
    if data['status'] != 200:
        raise AssertionError(data)

    try:
        price = data['prices'][0]['price']
    except (IndexError, KeyError) as exc:
        logger.warning(f'Hotel({name}) not found. {data}', exc_info=exc)
        return None

    if '元' in price or '¥' in price:
        currency = 'CNY'
    elif '€' in price:
        currency = 'EUR'
    elif '$' in price:
        currency = 'USD'
    else:
        raise TypeError(price)

    matched = PRE.search(price)
    if matched is None:
        # A currency symbol without a parsable amount is as unusable as an
        # unknown currency, so report it the same way.
        raise TypeError(price)
    price = Decimal(matched.group(1).replace(',', ''))
    if currency != 'CNY':
        price *= get_exchange_rate(currency, 'CNY')

    return push_to_influxdb(oid, hotel['name'], checkin, checkout, days, price)
Exemple #2
0
def find_hotel_name(provider, hotel_id):
    """Build a "<hotel name> <place>" search string for a hotel.

    Args:
        provider: ``"cms"`` for hotels stored in ``hub.poi_items``; otherwise
            a supplier id or name that is looked up in
            ``settings.SUPPLIER_ID_2_COLL`` / ``settings.SUPPLIER_NAME_2_COLL``.
        hotel_id: The CMS ``_id`` (for ``"cms"``) or the supplier's hotel id.

    Returns:
        The name string, or ``False`` when the hotel, its name, or the
        supplier collection cannot be found.
    """
    if provider == "cms":
        hub = databases("hub")
        data = hub["poi_items"].find_one(
            {"_id": ObjectId(hotel_id)},
            {"name_en": 1, "city": 1},
        )
        if not data:
            logger.info(f"invalid cms_id : {hotel_id}")
            return False
        city = hub["meta_cities"].find_one(
            {"_id": data.get("city", "")},
            {"name_en": 1},
        )
        if not city or not city.get("name_en", ""):
            logger.info(
                f"invalid city_id of hotel: {hotel_id}, {data.get('city')}")
            city_name = ""
        else:
            city_name = city["name_en"]
        return f"{data.get('name_en')} {city_name}"

    quoter_coll = (settings.SUPPLIER_ID_2_COLL.get(provider)
                   or settings.SUPPLIER_NAME_2_COLL.get(provider))
    if not quoter_coll:
        return False
    db = databases("whotel" if quoter_coll == "wg_hotel" else "scripture")

    # Match the id as an integer (when it is numeric) and as a string, against
    # both the ``hotel_id`` and ``code`` fields.
    str_hid = str(hotel_id)
    candidates = [{"code": str_hid}, {"hotel_id": str_hid}]
    try:
        candidates.insert(0, {"hotel_id": int(hotel_id)})
    except (TypeError, ValueError):
        pass  # non-numeric id: only the string forms apply

    # ``city`` is read below, so it has to be part of the projection; it was
    # previously omitted, which made the city lookup always fall through to
    # the province.
    hotel_msg = db[quoter_coll].find_one(
        {"$or": candidates},
        {"name": 1, "city": 1, "province": 1},
    )
    if not hotel_msg or not hotel_msg.get("name"):
        return False

    city = hotel_msg.get("city")
    city_name = city.get("name") if isinstance(city, dict) else ""
    place = city_name or hotel_msg.get("province", "")
    return f"{hotel_msg['name'].replace('&', ' ')} {place}"
Exemple #3
0
def send_tf_id_errormsg():
    '''
    Report hotels whose Travflex (TF) hotel ID is misconfigured or expired.

    Reads every ``travflex_id_error`` record from ``taskmsg.availability``,
    posts one DingTalk markdown message listing the affected CMS hotels, then
    deletes the records that were reported. Does nothing when there are no
    such records.
    '''
    db = databases('scripture')
    collection = db['taskmsg.availability']
    records = list(collection.find({"type": "travflex_id_error"}))
    if not records:
        # Nothing to report: avoid sending an empty alert.
        return

    links = "".join(
        f"- cms链接: http://wop.feifanweige.com/admin/hotels/{e['hotel_id']}\n"
        for e in records
    )
    text = "# 有线上酒店填写的Travflex酒店ID已失效 \n\n" + links
    # Only records up to the newest one we actually reported are deleted, so
    # records inserted while the message is being sent are kept.
    last_updated_time = max(e['updated_at'] for e in records)

    payload = {
        "msgtype": "markdown",
        "markdown": {
            "title": "有线上酒店填写的Travflex酒店ID已失效",
            "text": text,
        },
    }
    requests.post(
        "https://oapi.dingtalk.com/robot/send",
        params=Ding_params,
        json=payload,
    )
    collection.delete_many({
        "type": "travflex_id_error",
        "updated_at": {"$lte": last_updated_time},
    })
Exemple #4
0
def db_update_type(cms_id, checkin, price_type, prices):
    """Store the bug-price check result for one hotel and check-in date.

    Args:
        cms_id: Hotel id used as ``hotel_id`` in ``statics.hotels.prices``.
        checkin: Check-in date identifying the entry inside ``prices``.
        price_type: ``-1`` marks the lookup as failed.
        prices: Price entry to store; may be empty when the lookup failed.

    Returns:
        None.
    """
    db = databases('scripture')
    collection = db['statics.hotels.prices']
    condition = {'hotel_id': cms_id, 'prices.checkin': checkin}
    upload = {"$currentDate": {"updated_at": True}}

    # Callers pass an empty dict on failure, so the key may be absent.
    price = prices.get('price')
    if price_type == -1 or not price or isinstance(price, str):
        upload['$set'] = {'prices.$.bug_price_type': -1}
    else:
        old_price = collection.find_one(condition, {'prices.$': 1})
        # Normally this does not happen; it only occurs when the original
        # data still has not been written to the database 5 minutes after the
        # task was published. When it does, discard this query result.
        if not old_price:
            logger.warning(f"bug_price_check_task_before_data_insert: {cms_id}, {checkin}")
            return
        thre_price = old_price['prices'][0].get('bug_thre_price', 0)
        # Work on a copy so the caller's dict is not modified.
        entry = dict(prices)
        entry['bug_price_type'] = 0 if price > thre_price else 1
        entry['bug_thre_price'] = thre_price
        upload['$set'] = {'prices.$': entry}

    collection.update_one(condition, upload)
Exemple #5
0
def update_bug_price_type(cms_id, checkin):
    """Query the cheapest quote for one hotel and date and store the result.

    Args:
        cms_id: String form of the hotel's ``_id`` in ``hub.poi_items``.
        checkin: Check-in date as ``YYYY-MM-DD``.

    Returns:
        The result of ``db_update_type``, or ``None`` when the hotel does not
        exist.
    """
    hub = databases('hub')
    hotel = hub["poi_items"].find_one(
        {"_id": ObjectId(cms_id)},
        {"quote_ids": 1, "min_booking_days": 1, "city": 1},
    )
    if not hotel:
        logger.error(f"not find cms hotel with {cms_id}!")
        return None

    min_booking_days = int(hotel.get("min_booking_days", 1))
    checkout = (
        datetime.strptime(checkin, "%Y-%m-%d")
        + timedelta(days=min_booking_days)
    ).strftime("%Y-%m-%d")
    payload = dict(
        checkin=checkin,
        checkout=checkout,
        roomfilters=[{"adults": 2}],
        quoters=[
            {"quoter": str(value["quoter"]), "hotel_id": value["hotel_id"]}
            for value in hotel.get("quote_ids") or []
            if value["hotel_id"].strip() != ""
        ],
    )

    # ``resp`` must exist even when the request itself fails, because the
    # error message below reports its content.
    resp = None
    try:
        # Each task queries only one date for one hotel: a single network
        # round trip.
        resp = requests.post(
            quotes_api, headers={"x-query-from": "robot"}, json=payload
        )
        res = resp.json()
    except Exception as exc:
        detail = resp.content if resp is not None else None
        logger.error(
            f"{checkin} {hotel['_id']} get price faild!\ndetail : {detail}",
            exc_info=exc,
        )
        return db_update_type(cms_id, checkin, -1, {})

    categorized = None
    if res and res.get("status") == 200:
        categorized = (res.get("data") or {}).get("categorized")
    if not categorized:
        return db_update_type(cms_id, checkin, -1, {})

    # The first room of the first category is taken as the cheapest one.
    _min_price_room = next(iter(categorized.values()))[0]
    supplier_rooms = find_each_min_supplier(categorized)
    _min_price = float(_min_price_room.get("total_price", 9999999))
    _min_supplier = _min_price_room.get("identity", {}).get(
        "provider", "Unknown"
    )

    _city_rate = hub['meta_cities'].find_one(
        {"_id": hotel['city']}, {'tax_rate': 1}
    )
    if not _city_rate:
        city_rate = 0.05
    else:
        city_rate = float(_city_rate.get('tax_rate', 0.05))
    without_tax_price = math.ceil(_min_price * (1 - city_rate))

    prices = {
        "checkin": checkin,
        "checkout": checkout,
        "price": _min_price,
        'without_tax_price': without_tax_price,
        "ori_price": float(
            _min_price_room.get("ori_total_price_cny", 9999999999999)
        ),
        "supplier": _min_supplier,
        "room_type_en": _min_price_room.get("room_type", ""),
        "room_type_cn": _min_price_room.get("translation", ""),
        "each_supplier": supplier_rooms,
        "updated_at": datetime.now(),
    }
    return db_update_type(cms_id, checkin, 1, prices)
Exemple #6
0
 def room_types(self):
     """Download the supplier's room types and upsert each one by ``code``.

     Calls ``GetRoomTypes`` on ``self.room_meal_entry_point`` and, for every
     child of the first element of the XML response, writes one document to
     ``statics.hotels.jactravel.roomTypes``.
     """
     query = {
         "userName": self.user,
         "password": self.pswd,
         "language": "en"
     }
     with requests.Session() as sess:
         endpoint = f"{self.room_meal_entry_point}/GetRoomTypes"
         resp = sess.get(endpoint, params=query)
         root = ET.fromstring(resp.content.decode('utf-8'))
         scripture = databases("scripture")
         for child in root[0]:
             # The first sub-element is used both as ``code`` and as ``id``.
             code = str(child[0].text)
             fields = {
                 'code': code,
                 'id': str(child[0].text),
                 'room_type': str(child[1].text),
                 'sharedRoom': str(child[2].text),
                 'sharedFacilities': str(child[3].text),
             }
             change = {
                 '$set': fields,
                 "$setOnInsert": {
                     "created_at": datetime.now()
                 },
                 "$currentDate": {
                     "last_modified": True
                 },
             }
             scripture['statics.hotels.jactravel.roomTypes'].update_one(
                 {'code': code},
                 change,
                 upsert=True,
             )
Exemple #7
0
def calendar_all():
    """Dispatch ``calendar_one`` for the most recently updated hotels.

    The number of hotels comes from the ``Activity`` feature switch and the
    number of days from the CMS hotel configuration; 130 days is used when
    the configuration cannot be read.

    Raises:
        SystemExit: If the hotel limit cannot be read from the feature switch.
    """
    default_days = 130
    hub = databases("hub")
    max_num = hub["feature_switch"].find_one(
        {"table": "Activity"}, {"config": 1}
    )
    try:
        max_num = max_num["config"]["max_price_calendar_hotel"]
    except Exception as exc:
        logger.error("cannot get max_num ! ", exc_info=exc)
        # Same effect as the built-in ``exit()``, without depending on the
        # ``site`` module having installed it.
        raise SystemExit

    # Start from the default so ``days`` is defined even when the request
    # succeeds at the transport level but returns a non-200 status.
    days = default_days
    try:
        resp = requests.get(
            f"{settings.CMS_API}/api/internal/configs/hotel",
            params={"configs": "price_calendar_display_day_span"},
            headers={"accept-version": "6.0.0"},
        )
        if resp and resp.status_code == 200:
            days = resp.json()["data"].get(
                "price_calendar_display_day_span", default_days
            )
    except Exception as exc:
        logger.error("get calendar days error!", exc_info=exc)

    hotels = (
        hub["poi_items"]
        .find(
            {"has_price_calendar": True},
            {"quote_ids": 1, "min_booking_days": 1},
        )
        .sort([("updatedAt", -1)])
        .limit(max_num)
    )
    for hotel in hotels:
        calendar_one.delay(str(hotel["_id"]), days=days)
def packages():
    """Queue ``package_check`` for every eligible package, in chunks of 16.

    Eligible packages are edited or audited, not offline, and have a relevant
    hotel. ``on_finished`` is chained after the chunked checks.

    Returns:
        The result of ``apply_async`` on the composed workflow.
    """
    hub = databases("hub")

    criteria = {
        "$and": [
            {
                "edit_status": {
                    "$in": ["edited", "audited"]
                }
            },
            {
                "publish_status": {
                    "$ne": "offline"
                }
            },
            {
                "has_relevant_hotel": True
            },
        ]
    }
    wanted_fields = {
        "hotels": 1,
        "daily_inventory": 1,
        "air_price": 1,
        "appreciation_fee": 1,
        "inventory_updated_at": 1,
    }
    cursor = hub.sku_packages.find(criteria, wanted_fields)

    # One single-element argument list per package, serialised to JSON.
    task_args = [
        [json.dumps(pkg, default=on_json_serialize)] for pkg in cursor
    ]
    workflow = package_check.chunks(task_args, 16) | on_finished.s()
    return workflow.apply_async()
Exemple #9
0
 def meal_types(self):
     """Download the supplier's meal types and upsert each one by ``code``.

     Calls ``GetMeals`` on ``self.room_meal_entry_point`` and, for every child
     of the first element of the XML response, writes one document (with its
     labels) to ``statics.hotels.jactravel.mealTypes``.
     """
     query = {
         "userName": self.user,
         "password": self.pswd,
         "language": "en"
     }
     with requests.Session() as sess:
         endpoint = f"{self.room_meal_entry_point}/GetMeals"
         resp = sess.get(endpoint, params=query)
         root = ET.fromstring(resp.content.decode('utf-8'))
         scripture = databases("scripture")
         for child in root[0]:
             labels = [
                 {
                     'lable_id': lable[0].text,
                     'lable_content': lable[1].text
                 }
                 for lable in child[2]
             ]
             datas = {
                 'id': child[0].text,
                 'code': str(child[0].text),
                 'meal_type': child[1].text,
                 'lables': labels
             }
             change = {
                 '$set': datas,
                 '$setOnInsert': {
                     'created_at': datetime.now()
                 },
                 "$currentDate": {
                     "last_modified": True
                 }
             }
             scripture['statics.hotels.jactravel.mealTypes'].update_one(
                 {'code': datas['code']}, change, upsert=True)
Exemple #10
0
def news_with_loc(loc: str) -> bool:
    """Fetch Google News for a location and store every entry.

    Args:
        loc: Location in the form ``"<city>, <country code>"``.

    Returns:
        Always ``True``; failures to save individual stories are logged.

    Raises:
        ValueError: If the feed has entries and ``loc`` contains no comma.
    """
    headers = {
        'User-Agent':
        ('Mozilla/5.0 (X11, Linux x86_64) AppleWebKit 537.36 '
         '(KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36'),
    }
    # The context manager releases the session's pooled connections; the
    # response body has already been read by then.
    with requests.Session() as session:
        session.proxies = settings.PROXIES
        resp = session.get(
            'https://news.google.com/news/rss/local/section/geo/{}'.format(loc),
            params={
                'ned': 'us',
                'hl': 'en'
            },
            headers=headers,
            timeout=5,
        )
    if resp.status_code != 200:
        logger.warning('Bad response %s %s %s', resp.status_code, resp.url,
                       resp.reason)

    scripture = databases('scripture')
    hub = databases('ai')
    Story.__db_table__ = scripture.cp_stories
    AINews.__db_table__ = hub.ai_news

    entries = feedparser.parse(resp.text)['entries']
    if not entries:
        return True

    # The location is the same for every entry, so split it only once.
    city, country_code = (part.strip() for part in loc.rsplit(',', 1))
    for entry in entries:
        story = Story(entry)
        story['city'] = city
        story['country_code'] = country_code
        published_at = arrow.get(story['published'], 'D MMM YYYY HH:mm:ss ZZZ')
        story['published_at'] = published_at.datetime
        story['scope'] = 'google'
        try:
            ai_story = AINews.from_story(**story.save())
            ai_story.save()
            logger.info('Successful to save %s to db', story['id'])
            logger.debug('Story %s', story)
        except Exception as exc:  # pylint: disable=W0703
            # One bad story must not stop the remaining ones.
            logger.error("%s", story)
            logger.exception(exc)
    return True
def hotel_matching(collection_name, query):
    """Run the matcher on one crawled hotel.

    Args:
        collection_name: Name of the collection in the ``scripture`` database.
        query: JSON-encoded filter selecting the hotel document.

    Returns:
        Always ``True``.
    """
    scripture = databases("scripture")
    matcher = Matching()
    source = scripture.get_collection(collection_name)
    criteria = json.loads(query)
    crawled_hotel = source.find_one(criteria, no_cursor_timeout=True)
    matcher.one(crawled_hotel, collection_name)
    return True
Exemple #12
0
def news_with_topic(topic: str) -> bool:
    """Fetch Google News headlines for a topic and store every entry.

    Args:
        topic: Topic name; it is upper-cased to build the feed URL.

    Returns:
        Always ``True``; failures to save individual stories are logged.
    """
    headers = {
        'User-Agent':
        ('Mozilla/5.0 (X11, Linux x86_64) AppleWebKit 537.36 '
         '(KHTML, like Gecko) Chrome/59.0.3071.86 Safari/537.36'),
    }
    url = ('https://news.google.com/news/rss/headlines/section/topic/'
           + topic.upper())
    # The context manager releases the session's pooled connections; the
    # response body has already been read by then.
    with requests.Session() as session:
        session.proxies = settings.PROXIES
        resp = session.get(url,
                           params={
                               'ned': 'us',
                               'hl': 'en'
                           },
                           headers=headers,
                           timeout=5)
    if resp.status_code != 200:
        logger.warning('Bad response %s %s %s', resp.status_code, resp.url,
                       resp.reason)

    scripture = databases('scripture')
    hub = databases('ai')
    Story.__db_table__ = scripture.cp_stories
    AINews.__db_table__ = hub.ai_news

    feed = feedparser.parse(resp.text)
    for entry in feed['entries']:
        story = Story(entry)
        story['scope'] = 'google'
        story['topic'] = topic
        published_at = arrow.get(story['published'], 'D MMM YYYY HH:mm:ss ZZZ')
        story['published_at'] = published_at.datetime
        # The AI record is built only from the saved story, inside the guard,
        # so one malformed entry cannot abort the rest of the feed.
        try:
            ai_story = AINews.from_story(**story.save())
            ai_story.save()
            logger.info('Successful to save %s to db', story)
        except Exception as exc:  # pylint: disable=W0703
            logger.exception(exc)
    return True
Exemple #13
0
def hotel_check():
    """Check prices of all online hotels and store the results.

    A ``refreshing`` flag document in ``hotel.online.check`` is set to
    ``True`` while the check runs and reset to ``False`` afterwards, even
    when the check fails.
    """
    hub = databases("hub")
    scripture = databases("scripture")
    check_coll = scripture["hotel.online.check"]

    def set_refreshing(state):
        check_coll.update_one({"__t": "flag"},
                              {"$set": {
                                  "refreshing": state
                              }},
                              upsert=True)

    set_refreshing(True)
    try:
        base_day = datetime.now()
        cursor = hub["poi_items"].find(
            {
                "__t": "Hotel",
                "edit_status": {
                    "$in": ["edited", "audited"]
                },
                "publish_status": "online",
            },
            {
                "_id": 1,
                "quote_ids": 1,
                "name": 1,
                "name_en": 1,
                'address': 1,
                'en.address': 1
            },
        )
        onlines = [(online, base_day) for online in cursor]
        for available in executer.map(check_price, onlines):
            if available:
                available["updated_at"] = datetime.now()
                check_coll.update_one(
                    {"hotel_id": available["_id"]},
                    {"$set": available},
                    upsert=True,
                )
    finally:
        # Never leave the flag stuck at True after a failure.
        set_refreshing(False)
Exemple #14
0
def save_order_message(order_message: Dict, email: str) -> Dict[str, Any]:
    """Upsert a parsed order message and forward new ones to the AI service.

    Args:
        order_message: Parsed message; must be a non-empty dict containing
            ``message_id``.
        email: Mailbox the message belongs to.

    Returns:
        A dict with the flags ``modified``, ``is_updated``, ``is_inserted``
        and ``sent_to_ai``.

    Raises:
        Ignore: If ``order_message`` is empty, not a dict, or has no
            ``message_id``.
    """
    usable = (order_message
              and isinstance(order_message, dict)
              and 'message_id' in order_message)
    if not usable:
        logger.debug(order_message)
        raise Ignore(order_message)

    scripture = databases('scripture')
    selector = {
        'email': email,
        'message_id': order_message['message_id']
    }
    change = {
        '$set': order_message,
        '$setOnInsert': {
            'created_at': datetime.now()
        },
        '$currentDate': {
            'updated_at': True
        }
    }
    u_result = scripture.g_orders.update_one(selector, change, upsert=True)

    scripture.g_users.update_one(
        {'email': email},
        {'$currentDate': {
            'last_fetched_at': True,
            'updated_at': True
        }})

    raw = u_result.raw_result
    modified = raw.get('nModified') == 1 and raw.get('ok') == 1

    inserted = u_result.upserted_id is not None
    sent_to_ai = False
    if inserted:
        # Only newly inserted messages are forwarded.
        ai_endpoint = f'{ai_base}{mapping[order_message["category"]]}'
        headers = {'cache-control': 'no-cache'}
        data = order_message.copy()
        data['capture_id'] = u_result.upserted_id
        data['email'] = email
        resp = requests.post(ai_endpoint, data=data, headers=headers)
        sent_to_ai = resp.status_code == 200
        if not sent_to_ai:
            logger.error('Failed when sent to ai: %s', resp.text)

    return {
        'modified': modified,
        'is_updated': not inserted,
        'is_inserted': inserted,
        'sent_to_ai': sent_to_ai
    }
Exemple #15
0
def _get_sem_map():
    """Return the stored SEM entries keyed by ``"<sem_name>=<sem_id>"``.

    Each value holds the entry's ``unit``, ``plan`` and ``keywords``.
    """
    entries = databases("scripture").statics.sem.find()
    return {
        f'{item["sem_name"]}={item["sem_id"]}': {
            "unit": item["unit"],
            "plan": item["plan"],
            "keywords": item["keywords"],
        }
        for item in entries
    }
Exemple #16
0
def incremental_fetch(email: str) -> bool:
    """Start fetching mail for an authenticated user.

    Args:
        email: Address of the user in ``g_users``.

    Returns:
        ``False`` when no authenticated user with that address exists,
        ``True`` once the request task has been queued.
    """
    users = databases('scripture').g_users
    user = users.find_one({'email': email, 'authenticated': True})

    if not user:
        return False  # TODO: Add Dingtalk notify of other imformations

    request_args = (email, user['access_token'])
    # TODO: only token, pop user's info
    # NOTE(review): the whole user document is passed as ``token`` here while
    # the request above receives only the access token -- confirm intent.
    on_response = dispatcher.s(email=email, token=user, uid=user['id'])
    do_request.apply_async(request_args, link=on_response)

    return True
Exemple #17
0
def dispatcher(messages: Dict[str, Any], email: str, uid: str, token: str):
    """Fan out the fetching and processing of one page of messages.

    Every message except the last is fetched asynchronously and piped through
    ``is_order_message`` -> ``parse_order_message`` -> ``save_order_message``.
    The last message is fetched synchronously; its result decides whether the
    next page is requested or ``incremental_fetch`` is scheduled one day
    later.

    Args:
        messages: Dict with a ``messages`` list whose items carry an ``id``;
            ``nextPageToken`` is read when the next page is requested.
        email: Mailbox address, used in request paths and database lookups.
        uid: User id forwarded to ``parse_order_message``.
        token: Passed to ``do_request`` for every request.
    Returns:
        None
    Raises:
        Ignore: If the ``messages`` list is empty.
    """

    if len(messages['messages']) < 1:
        raise Ignore(None)

    # Callback chain: is_order_message -> parse_order_message -> save.
    save_cb = save_order_message.s(email=email)
    parse_order_message_cb = parse_order_message.s(email=email, uid=uid)
    parse_order_message_cb.link(save_cb)
    is_order_message_cb = is_order_message.s()
    is_order_message_cb.link(parse_order_message_cb)
    # do_request_s = do_request.s(token=token)
    # All messages but the last are fetched asynchronously as one group.
    group([
        do_request.signature(
            (f'users/{email}/messages/{message["id"]}', token),
            link=is_order_message_cb
        )
        for message in messages['messages'][:-1]
    ]) \
        .apply_async()
    # The last message is fetched inline because its result drives paging.
    last_message = messages['messages'][-1]
    result = do_request(f'users/{email}/messages/{last_message["id"]}',
                        token=token)

    scripture = databases('scripture')

    last_fetched_at = scripture.g_users \
        .find_one({'email': email}) \
        .get('last_fetched_at')

    if must_request_next_page(result, last_fetched_at):
        # Request the next page and feed it back into this dispatcher.
        params = {'pageToken': messages['nextPageToken']}
        do_request.apply_async((f'users/{email}/messages/', token, params),
                               link=dispatcher.s(email=email,
                                                 uid=uid,
                                                 token=token))
    else:
        # No further page requested: schedule the next fetch in one day.
        incremental_fetch.apply_async((email, ),
                                      eta=datetime.now() + timedelta(days=1))

    # The inline-fetched last message still goes through the order check.
    is_order_message.apply_async((result, ), link=parse_order_message_cb)
Exemple #18
0
    def multi(self):
        """Run ``self.one`` on every document of the crawled-hotel collections.

        Iterates over ``bookings``, ``hotels`` and ``ctrips`` in the
        ``scripture`` database.

        Returns:
            None.
        """
        scripture = databases("scripture")
        collections = [scripture.bookings, scripture.hotels, scripture.ctrips]
        for collection in collections:
            cursor = collection.find(no_cursor_timeout=True)
            # A cursor opened with ``no_cursor_timeout`` is never reaped by
            # the server, so close it explicitly even when matching fails.
            try:
                for crawled_hotel in cursor:
                    self.one(crawled_hotel, collection.name)
            finally:
                cursor.close()
Exemple #19
0
def _update_sem(sem_file_path):
    """Replace the stored SEM entries with the content of a JSON file.

    Args:
        sem_file_path: Path of a JSON object whose keys have the form
            ``"<sem_name>=<sem_id>"`` and whose values carry ``unit``,
            ``plan`` and ``keywords``.

    Raises:
        IndexError: If a key contains no ``=``.
    """
    # Read and convert the file before touching the database, so a missing or
    # malformed file cannot leave the collection empty.
    with open(sem_file_path, "r") as f:
        sem = json.load(f)

    docs = []
    for key, value in sem.items():
        parts = key.split("=")
        docs.append({
            "sem_id": parts[1],
            "sem_name": parts[0],
            "unit": value["unit"],
            "plan": value["plan"],
            "keywords": value["keywords"],
        })

    db = databases("scripture")
    db.statics.sem.delete_many({})
    if docs:  # insert_many rejects an empty list
        db.statics.sem.insert_many(docs)
    logger.info("Inserted %s", db.statics.sem.count_documents({}))
Exemple #20
0
def get_skyscanner(start_time, days, sid=None, hotel_id=None, hotel_name=None):
    """Resolve a hotel's Skyscanner id if needed, then crawl it.

    Args:
        start_time: Forwarded unchanged to ``crawl_one``.
        days: Forwarded unchanged to ``crawl_one``.
        sid: Skyscanner id; looked up when not given.
        hotel_id: CMS hotel id, used first to look up ``sid``.
        hotel_name: Hotel name, used to look up ``sid`` when there is no
            ``hotel_id``.

    Returns:
        The result of ``crawl_one``, or ``False`` when neither ``hotel_id``
        nor ``hotel_name`` is given or no Skyscanner id can be found.
    """
    import re

    if not hotel_id and not hotel_name:
        logger.info(f"skyscanner withou hotel_id and name! sid: {sid}")
        return False
    if sid:
        return crawl_one(start_time, days, sid, hotel_id, hotel_name)

    if hotel_id:
        hub = databases("hub")
        ori_sid = hub["poi_items"].find_one(
            {"_id": ObjectId(hotel_id)}, {"third_ref_ids": 1}
        )
        # The hotel itself may be missing, not just its reference ids.
        if not ori_sid or not ori_sid.get("third_ref_ids"):
            logger.info(
                f"hotel without skyscanner id! hotel_id : {hotel_id}"
            )
            return False
        return crawl_one(
            start_time,
            days,
            ori_sid["third_ref_ids"][0]["value"],
            hotel_id,
            hotel_name,
        )

    scripture = databases("scripture")
    # Escape the name so characters such as "(" or "+" are matched literally
    # instead of being interpreted as regular-expression syntax.
    ori_sid = scripture["statics.hotels.skyscanner"].find_one(
        {"name": {"$regex": re.escape(hotel_name)}}, {"sid": 1}
    )
    if not ori_sid:
        logger.info(
            f"hotel_name not find in skyscanner datas! hotel_name : {hotel_name}"
        )
        return False
    return crawl_one(
        start_time, days, ori_sid["sid"], hotel_id, hotel_name
    )
Exemple #21
0
def booking_com():
    """Crawl hotel prices from booking.cn.

    For every hotel of every package, queues ``fetch_price_by_hotel`` for ten
    consecutive check-in dates starting 15 days from now.
    """
    packages = databases('hub').sku_packages.find()
    first_checkin = datetime.now() + timedelta(days=15)
    for pkg in packages:
        for htl in pkg['hotels']:
            oid = str(htl['hotel'])
            days = htl['days']
            for interval in range(10):
                # Offset from the fixed base date; adding to a running value
                # would make the dates drift further with every iteration,
                # hotel and package.
                checkin = first_checkin + timedelta(days=interval)
                checkout = checkin + timedelta(days=days)
                fetch_price_by_hotel.delay(
                    oid,
                    checkin.strftime('%Y-%m-%d'),
                    checkout.strftime('%Y-%m-%d'),
                    days
                )
def compare_data(new_prices, checkin_str, cms_id, url):
    """Compare freshly crawled room prices with the stored ones and alert.

    Args:
        new_prices: Crawled rooms; each has ``occupancy``, ``room_type``,
            ``policies`` and ``price``.
        checkin_str: Check-in date identifying the stored price entry.
        cms_id: Hotel id used as ``cms_id`` in ``statics.booking.prices``.
        url: Crawled URL, included in alerts.

    Returns:
        None. Alerts are sent through ``dingding``.
    """
    def room_hash(room):
        # Rooms are matched by a hash of occupancy, room type and policies.
        return Simhash(
            f'{room["occupancy"]}{room["room_type"]}{room["policies"]}'
        ).value

    scripture = databases('scripture')
    db_prices = scripture.statics.booking.prices.find_one(
        {"cms_id": cms_id, 'prices.checkin': checkin_str},
        {
            'prices.$': 1
        }
    )
    if not db_prices and new_prices:
        logger.error(f'爬虫失效/数据库失连,数据库cms_id:{cms_id},目标url:{url}')
        title = 'booking_prices爬虫失效/数据库失连'
        text = f'## [告警]booking爬虫失效\n,数据库cms_id:{cms_id},目标url:{url}'
        dingding(title, text)
    elif db_prices and not new_prices:
        logger.error(f'比价模块失效,数据库cms_id:{cms_id},目标url:{url}')
        title = 'booking_prices比价模块失效'
        text = f'## [告警]booking比价模块失效\n,数据库cms_id:{cms_id},目标url:{url}'
        dingding(title, text)
    elif db_prices and new_prices:
        db_prices = db_prices.get('prices')[0].get('prices')
        new_prices_info = {
            room_hash(room): room["price"] for room in new_prices
        }
        for one_room_dict in db_prices:
            db_one_room_price = one_room_dict.get('price')
            new_one_room_price = new_prices_info.get(room_hash(one_room_dict))
            if not new_one_room_price:
                # The message used to reference an undefined name; the hotel
                # id is what identifies the hotel here.
                logger.info(f'{cms_id}在{checkin_str}的房型({one_room_dict["room_type"]})已售出')
                # Nothing to compare against for a room that is gone.
                continue
            if compare_price(db_one_room_price, new_one_room_price):
                _info = {
                    'url': url,
                    'checkin': checkin_str,
                    'room_type': one_room_dict["room_type"],
                    'mongodb_price': db_one_room_price,
                    'celery_task_get_price': new_one_room_price
                }
                title = 'booking_prices异常'
                text = f'## [告警]booking抓取价格异常\n{_info}'
                dingding(title, text)
Exemple #23
0
def make_requests():
    """Dispatch news-fetching tasks by topic and by location.

    Queues ``news_with_topic`` for each known topic, then ``news_with_loc``
    for every entry of ``countries`` with a population of at least 10000,
    largest first, pausing one second between tasks.

    Returns:
        Always ``True``.
    """
    scripture = databases('scripture')

    avalible_topics = ('technology', 'business', 'entertainment', 'sports',
                       'science', 'health')
    for topic in avalible_topics:
        news_with_topic.apply_async([topic], time_limit=5, soft_time_limit=3)

        gevent.sleep(1)

    cursor = scripture.countries \
        .find({'population': {'$gte': 10000}}, no_cursor_timeout=True) \
        .sort('population', -1)
    # A cursor opened with ``no_cursor_timeout`` is never reaped by the
    # server, so close it explicitly even when dispatching fails.
    try:
        for loc in cursor:
            location = '{}, {}'.format(loc['ascii_name'], loc['country_code'])
            news_with_loc.apply_async([location], time_limit=5,
                                      soft_time_limit=3)

            gevent.sleep(1)
    finally:
        cursor.close()

    return True
Exemple #24
0
def get_booking_url(provider, hotel_id=None, hotel_name=None):
    """Return the booking.com URL path of a hotel.

    For ``"cms"`` hotels the stored ``bk_url`` crawl URL is used when
    present. Otherwise booking.com is searched by hotel name and the first
    result is taken.

    Args:
        provider: ``"cms"`` or a supplier identifier understood by
            ``find_hotel_name``.
        hotel_id: Hotel id for the given provider.
        hotel_name: Search name; resolved via ``find_hotel_name`` when empty.

    Returns:
        The URL path, or ``False`` when the name cannot be resolved, the
        search fails, or no result link is found.
    """
    if provider == "cms":
        hub = databases("hub")
        data = hub["poi_items"].find_one(
            {
                "_id": ObjectId(hotel_id),
                "crawl_info.crawl_website": "bk_url"
            },
            {"crawl_info.$": 1},
        )
        if data and data.get("crawl_info"):
            return str(URL(data["crawl_info"][0]["crawl_url"]).path)
    if not hotel_name:
        hotel_name = find_hotel_name(provider, hotel_id)
        if not hotel_name:
            logger.info(f"not find hotel_name with {provider}, {hotel_id}")
            return False
    # Let requests encode the name: characters such as "&" or "#" in a hotel
    # name would otherwise corrupt the query string.
    resp = requests.get(
        "https://www.booking.com/searchresults.zh-cn.html",
        params={"ss": hotel_name},
        headers={
            "User-Agent":
            "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_13_5) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/70.0.3538.102 Safari/537.36"
        },
    )
    if resp.status_code != 200:
        # logger.error(f'hotel: {hotel_id}, {hotel_name} get url failed.')
        return False
    et = etree.HTML(resp.content.decode("utf-8"))
    try:
        # AttributeError: the document could not be parsed (``et`` is None);
        # IndexError: the page contains no result link.
        links = et.xpath('//a[@class="hotel_name_link url"]/@href')
        return links[0].strip().split("?")[0]
    except (AttributeError, IndexError) as exc:
        logger.error(
            f"hotel: {hotel_id}, {hotel_name} get url in {resp.url} failed.",
            exc_info=exc,
        )
        return False
Exemple #25
0
 def _set_relationships(self, collection, object_id, rel_collection,
                        rel_object_id):
     """Link a document to a crawled object unless that link already exists.

     Returns ``True`` when any document in ``collection`` already references
     the crawled object; otherwise pushes the relation onto the document with
     ``_id == object_id`` and returns the raw update result.
     """
     target = databases("agent").get_collection(collection)
     already_linked = target.find_one({
         "relation_to_crawled.rel_collection":
         rel_collection,
         "relation_to_crawled.rel_object_id":
         rel_object_id,
     })
     if already_linked:
         return True

     relation = {
         "rel_collection": rel_collection,
         "rel_object_id": rel_object_id,
     }
     outcome = target.update_one(
         {"_id": object_id},
         {"$push": {"relation_to_crawled": relation}},
     )
     return outcome.raw_result
Exemple #26
0
        'color': 'yellow'
    },
    'error': {
        'color': 'red'
    },
    'critical': {
        'bold': True,
        'color': 'red'
    }
}
# Install colored log output at DEBUG level, styled per LEVEL_STYLE.
coloredlogs.install(level='DEBUG',
                    isatty=True,
                    level_styles=LEVEL_STYLE,
                    fmt='%(asctime)s %(name)s %(levelname)s %(message)s')

# Module-level handle to the "agent" database.
AGENT = databases('agent')
# OSS bucket client built from the credentials and endpoint in settings.
auth = oss2.Auth(settings.OSS_ACCESS_KEY_ID, settings.OSS_SECRET_ACCESS_KEY)
OSS = oss2.Bucket(auth, settings.OSS_ENDPOINT, settings.OSS_BUCKET)


def statistic_cdn_image(provider: enum.Enum) -> Tuple[int, int, str]:
    """ 上传document中cdn_images上传失败的图片,并统计供应商图片个数、成功个数、成功率

    Args:
       provider: 供应商
    Returns:
       图片总数,图片上传成功个数,成功率,cdn_images上传成功率
    """
    count = 0
    success = 0
Exemple #27
0
from functools import partial

from bson import ObjectId
from celery.utils.log import get_task_logger
from tasks.application import app  # noqa
from tasks.utils.database import databases  # noqa
# First Party
from tasks import settings
from tasks.supplier_statics.hotel_name import fetch_ctrip_name
from tasks.supplier_statics.postal_code import get_province_by_postal_code
from tasks.supplier_statics.supplier_images import ImageSaver
from tasks.utils.notifiers import DingtalkMessage, DingtalkNotifier
from tasks.errors import NotifyFailed
from tasks import settings

# Module-level database handles, created once at import time.
DB = databases("scripture")
HUB = databases("hub")
# Field names; presumably the hotel attributes handled as a group by the
# suppliers below -- TODO confirm against the code that uses key_list.
key_list = ["latitude", "longitude", 'telephone', "website"]


class Providers(enum.Enum):
    """Known providers; each member's value is the provider's string name."""
    bonotel = "bonotel"
    roomsxml = "roomsxml"
    hotelbeds = "hotelbeds"
    hotelspro = "hotelspro"
    jactravel = "jactravel"
    relux = "relux"
    # The only member whose value differs from its name (dotted form).
    relux_rooms = 'relux.rooms'


class BaseSupplier(object):
Exemple #28
0
def check_preparation(
    hotels, start_time=None, end_time=None, max_days=None, min_booking_days=1
):
    """Queue one ``preparation_one`` task per requested hotel.

    hotels: [
        {
            'provider': Provider.provider or 'cms' or 'providers'
            'hotel_id': hotel_id or cms_id or id1::provider_id1;id2::provider_id2
        }
    ]

    Args:
        hotels: Hotels to check, in the shape shown above.
        start_time: Forwarded unchanged to ``preparation_one``.
        end_time: Forwarded unchanged to ``preparation_one``.
        max_days: Forwarded unchanged to ``preparation_one``.
        min_booking_days: Minimum stay for non-CMS hotels; CMS hotels use the
            value stored on the hotel (default 1).

    Returns:
        The string ``"preparation check publish succeed"``.
    """
    hub = databases("hub")
    for hotel in hotels:
        booking_days = min_booking_days
        if hotel["provider"] == "cms":
            data = hub["poi_items"].find_one(
                {"_id": ObjectId(hotel["hotel_id"])},
                {"quote_ids": 1, "min_booking_days": 1},
            )
            if not data:
                logger.error(f"not find cms hotel with {hotel['hotel_id']}!")
                continue
            p_hotel = {
                "id": hotel["hotel_id"],
                # A hotel without quote ids must not abort the whole batch.
                "hotels": [
                    {"quoter": str(e["quoter"]), "hotel_id": e["hotel_id"]}
                    for e in data.get("quote_ids") or []
                ],
            }
            booking_days = data.get("min_booking_days", 1)
        elif hotel["provider"] == "providers":
            # hotel_id has the form "id1::provider1;id2::provider2".
            p_hotel = {"id": hotel["hotel_id"], "hotels": []}
            for pair in hotel["hotel_id"].split(";"):
                hotel_id, provider = pair.split("::")
                provider = settings.SUPPLIER_NAME_2_ID.get(provider, provider)
                p_hotel["hotels"].append(
                    {"quoter": provider, "hotel_id": hotel_id}
                )
        else:
            provider = settings.SUPPLIER_NAME_2_ID.get(
                hotel["provider"], hotel["provider"]
            )
            p_hotel = {
                "id": f"{hotel['hotel_id']}::{provider}",
                "hotels": [
                    {"quoter": provider, "hotel_id": hotel["hotel_id"]}
                ],
            }
        preparation_one.delay(
            hotel=p_hotel,
            start_time=start_time,
            end_time=end_time,
            max_days=max_days,
            min_booking_days=booking_days,
        )
    return "preparation check publish succeed"
from functools import lru_cache

# First Party
import yaml
import pinyin
from pysolr import Solr
from pymongo import MongoClient
from tasks.utils.database import databases
from bson import ObjectId

# Current Project
import requests

# from solrcloudpy import SolrConnection

# Module-level database handles, created once at import time.
db = databases("agent")
hub_db = databases("hub")

# Host of the Solr server used to build index URLs.
solr_host = "172.16.1.223"


def push_hotels(cursor, supplier):
    solr = Solr(f"http://{solr_host}/solr/hotels")
    docs = []
    index = 1
    total = cursor.count()
    for doc in cursor:
        d = {
            "id": str(doc["_id"]),
            "name": doc["name"],
            "name_cn": doc.get("name_cn", ""),
Exemple #30
0
        else:
            self.logger.critical("city name missing: city code(%s) ",
                                 hotel["destination"])
        country = self.table("countries").find_one({"code": hotel["country"]})
        doc["country"] = {"code": hotel["country"]}
        if country:
            doc["country"]["name"] = country.get("name")
        else:
            self.logger.critical("country name missing: country code(%s) ",
                                 hotel["country"])
        if doc["regions"]:
            doc["province"] = self.get_province_by_region_codes(
                doc["regions"]) or ""
        if hotel["images"]:
            doc["images"] = [
                img["original"] for img in hotel["images"]
                if img.get("original")
            ]

        doc["updated_at"] = datetime.strptime(hotel["updated_at"],
                                              "%Y-%m-%dT%H:%M:%S.%fZ")
        if 'code' not in doc:
            doc['code'] = str(doc.get('hotel_id', ''))
        return doc


if __name__ == "__main__":
    # Script entry point: run the HotelsPro regions step against the
    # "scripture" database.
    from tasks.utils.database import databases

    HotelsPro(databases("scripture")).regions()