async def add_compare_prices(compare, uid, hotel_prices, start_time, end_time):
    """Merge prices from a comparison source into the per-checkin price map."""
    db = databases("scripture")
    coll = f"statics.{compare}.prices"
    for hotel in uid.split(";"):
        # Incoming ids are "quoter::hotel_id"; cms_id stores them reversed
        # as "hotel_id::quoter".
        quoter_id, hotel_id = hotel.split("::")
        cms_id = f"{hotel_id}::{quoter_id}"
        hotel_data = await db[coll].find_one({"cms_id": cms_id}) or {}
        compare_prices = {
            price["checkin"]: price
            for price in hotel_data.get("min_price", []) or hotel_data.get("prices", [])
            if price and isinstance(price, dict)
            and start_time <= datetime.strptime(price["checkin"], "%Y-%m-%d") <= end_time
        }
        for checkin in hotel_prices:
            if checkin not in compare_prices:
                hotel_prices[checkin][compare] = ""
                continue
            price = compare_prices[checkin]
            hotel_prices[checkin][compare] = price.get("price", "")
            hotel_prices[checkin][f"{compare}_room"] = (
                price.get("room_type", "") or price.get("room_type_cn", ""))
    return hotel_prices
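# Usage sketch (hypothetical ids and dates, not from the original code): uid
# carries one or more "quoter::hotel_id" pairs joined by ";"; the merged map
# gains a "<compare>" price column and a "<compare>_room" column per checkin.
async def _example_add_compare_prices():
    hotel_prices = {
        "2019-06-01": {"weego_price": 120.0},
        "2019-06-02": {"weego_price": 135.0},
    }
    return await add_compare_prices(
        compare="booking",                  # reads statics.booking.prices
        uid="2::5c0000000000000000000001",  # quoter id "::" hotel id (hypothetical)
        hotel_prices=hotel_prices,
        start_time=datetime(2019, 6, 1),
        end_time=datetime(2019, 6, 30),
    )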
async def _booking(request):
    valid = await validate_request(request)
    if valid["errmsg"]:
        return rest_result(request, {"status": 400, "errmsg": valid["errmsg"]})
    cms_ids = valid["hotels"]
    db = databases("hub")
    start_time = valid["start_time"]
    end_time = valid["end_time"]
    days = valid["days"]
    # Guard against a start time earlier than today, which would trigger useless queries.
    # TODO: extract a standalone date-validation helper.
    for index, hid in enumerate(cms_ids):
        booking_url = await db["poi_items"].find_one(
            {"_id": ObjectId(hid), "crawl_info.crawl_website": "bk_url"},
            {"crawl_info.$": "1"},
        )
        if not booking_url:
            continue
        await get_booking_prices(
            booking_url["crawl_info"][0]["crawl_url"], hid, start_time, days)
        calendar_one.delay(hid, start_time, end_time)
        # Schedule a delayed price check for every tenth hotel.
        if index % 10 == 0:
            _check_prices.apply_async(
                kwargs={
                    "base_url": booking_url["crawl_info"][0]["crawl_url"],
                    "cms_id": hid,
                    "start_time": start_time,
                    "days": days,
                },
                countdown=settings.CHECK_PRICE_DELAY_TIME,
            )
    return rest_result(request, {"status": 200, "data": "ok"})
async def crawl_booking(cid, url):
    logger = logging.getLogger(__name__)
    scripture = databases("scripture")
    statics_data = await scripture["capture_urls"].find_one({"_id": ObjectId(cid)})
    quoter = statics_data.get("quoter")
    hid = statics_data.get("hid")
    payload = paras_booking_payload(url)
    if quoter and hid:
        where_clause = {"quoter": quoter, "hid": hid}
        bookings_id = f"{quoter}::{hid}"
    else:
        where_clause = {"capture_urls_id": cid}
        bookings_id = cid
    payload["capture_urls_id"] = cid
    res = await scripture.bookings.update_one(where_clause, {"$set": payload},
                                              upsert=True)
    if res:
        await scripture.capture_urls.update_one(
            {"_id": statics_data["_id"]},
            {
                "$set": {
                    "_hotels_cn_id": statics_data.get("hotels_cn_id"),
                    "hotels_cn_id": "",
                    "bookings_id": bookings_id,
                    "jset_id": "",
                    "_jset_id": statics_data.get("jset_id"),
                }
            },
        )
        logger.info(f"cid:{cid}, url:{url}, update statics data success")
        return {"status": 200, "data": payload}
    logger.error(f"cid:{cid}, url:{url}, update statics data failed")
    return {"status": 500, "errmsg": "update statics data failed!"}
async def get_city_name(city_id):
    hub = databases("hub")
    name = await hub["meta_cities"].find_one({"_id": city_id}, {"name_en": "1"})
    return name["name_en"] if name else ""
async def refresh_datas(request):
    logger = logging.getLogger(__name__)
    db = databases("scripture")
    is_refreshing = await db["hotel.online.check"].find_one(
        {"__t": "flag"}, {"refreshing": "1"})
    # Only kick off a new background check when none is already running.
    if not is_refreshing or not is_refreshing.get("refreshing"):
        hotel_online_check.delay()
        logger.info("hotel online check refresh triggered")
    return html("""数据更新中,请稍后刷新页面下载excel""")
async def orders(request, email):
    """List orders."""
    SORT = {  # pylint: disable=C0103
        '-1': -1, '1': 1,
        'DESC': -1, 'ASC': 1,
        'desc': -1, 'asc': 1,
        -1: -1, 1: 1,
    }
    scripture = databases('scripture')
    query = {'email': email}
    u = await scripture.g_users.find_one({'email': email})  # noqa pylint: disable=C0103
    if not u:
        return json(
            {
                'error': 'User is not authenticated, please authenticate first',
                'status': 403
            },
            status=403)
    if not u.get('authenticated'):
        return json({
            'error': 'User authentication has expired!',
            'status': 401
        }, status=401)
    # Default to newest-first when the sort parameter is unrecognized.
    sort = SORT.get(request.args.get('sort', 'DESC').upper(), -1)
    limit = int(request.args.get('limit', 20))
    page_number = int(request.args.get('page_number', 0))
    filters = request.args.get('filters')
    if filters:
        # $in requires an array; filters arrives as a comma-separated string.
        query['type'] = {'$in': filters.split(',')}
    cursor = scripture.g_orders \
        .find(query) \
        .sort('created_at', sort) \
        .skip(limit * page_number) \
        .limit(limit)
    return json(
        jsonutil.dumps([order async for order in cursor],
                       ensure_ascii=False,
                       default=str))
async def _hotels_details(request):
    logger = logging.getLogger(__name__)
    db = request.args.get("db")
    _id = request.args.get("_id")
    scripture = databases("scripture")
    if db in ("hotels", "bookings"):
        hotels_info = await scripture[db].find_one({"_id": ObjectId(_id)})
    else:
        hotels_info = {"status": 400, "errmsg": "params invalid"}
    logger.info(f"{request} _id:{_id}, response_dict:{hotels_info}")
    return rest_result(request, hotels_info)
async def start_compair(websites, compair):
    rds = databases(settings.REDIS)
    to_redis = json.dumps({
        'spider_name': 'compair',
        'websites': websites,
        'compair': compair
    })
    try:
        await rds.lpush('distributed_spider', to_redis)
        return True
    except Exception as exc:
        logger.warning("redis error", exc_info=exc)
        return False
async def hotel(hotel_ids):
    # Group ids per provider so each agent collection is queried once with $in.
    hotel_mapping = {}
    for item in hotel_ids:
        provider, _id = item.split(":")
        hotel_mapping.setdefault(provider, []).append(ObjectId(_id))
    docs = []
    for provider, ids in hotel_mapping.items():
        cursor = databases("agent").get_collection(provider).find(
            {"_id": {"$in": ids}})
        async for hotel_doc in cursor:
            docs.append(HotelDoc(hotel_doc, provider))
    return await solr_add(docs, "hotels")
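# Usage sketch (hypothetical provider and ids): each item is
# "provider:object_id", i.e. an agent collection name followed by the Mongo
# _id of the hotel document to index into Solr.
async def _example_index_hotels():
    return await hotel([
        "hotelbeds:5c0000000000000000000001",
        "hotelbeds:5c0000000000000000000002",
    ])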
async def search(request):
    logger = logging.getLogger(__name__)
    p = request.form.get('partial', '').lower()
    scripture = databases('scripture')
    cursor = scripture.statics.hotels.hotelbeds.find(
        {"$text": {"$search": f'"{p}"'}})
    hotels = [formatted(hotel) async for hotel in cursor]
    style_classes = 'table table-bordered table-hover'
    table = json2html.convert(
        hotels,
        table_attributes=f'id="info-table" class="{style_classes}"')
    logger.info(f'partial:{p} response success')
    return html(HTML.format(p, table))
async def start_crawl_hcom(request):
    logger = logging.getLogger(__name__)
    db = databases("scripture")
    wb = xlwt.Workbook(encoding="utf-8")
    st = wb.add_sheet("sheet1")
    st.write(0, 0, "酒店网址")
    st.write(0, 1, "酒店中文名称")
    st.write(0, 2, "酒店英文名称")
    st.write(0, 3, "酒店中文地址")
    st.write(0, 4, "酒店英文地址")
    row = 1
    # Truncate "now" to midnight so the query covers the whole current day.
    base_day = datetime.now().strftime("%Y-%m-%d")
    base_day = datetime.strptime(base_day, "%Y-%m-%d")
    hotels = []
    async for hotel in db["hotel.online.check"].find(
            {"updated_at": {"$gte": base_day}}):
        hotels.append(hotel)
    if not hotels:
        # No fresh data today: flag the refresh and start the background check.
        await db["hotel.online.check"].update_one(
            {"__t": "flag"}, {"$set": {"refreshing": True}})
        hotel_online_check.delay()
        logger.info(f"base_day:{base_day}, refresh triggered")
        return html("""数据更新中,请稍后刷新页面下载excel""")
    for online in hotels:
        st.write(row, 0, f"https://flashtrip.cn/hotels/{online.get('_id', '')}")
        st.write(row, 1, online.get("name", ""))
        st.write(row, 2, online.get("name_en", ""))
        st.write(row, 3, online.get("address", ""))
        st.write(row, 4, online.get("en", {}).get("address", ""))
        row += 1
    excel = BytesIO()
    wb.save(excel)
    excel.seek(0)
    logger.info(f"exported online hotel status, filename={base_day}上线状态酒店.xls")
    return raw(
        excel.getvalue(),
        headers={
            "Content-Disposition": f"attachment;filename={base_day}上线状态酒店.xls"
        },
        content_type="application/vnd.ms-excel",
    )
async def save_user_record(cms_id, stage, price, checkin, room_type, checkout,
                           meal_type, is_package, user_id, source,
                           cancel_policy, user_ip, source_type, voucher,
                           deal_check_code):
    db = databases("scripture")
    payload = {
        "cms_id": cms_id,
        "stage": stage,
        "checkin": checkin,
        "checkout": checkout,
        "is_package": is_package,
        "user_id": user_id,
        "query_time": datetime.now(),
        "source": source,
        "source_type": source_type,
        "user_ip": user_ip,
        "created_at": datetime.now(),
        "weego_price": price,
        "meal_type": meal_type,
        "room_type": room_type,
        "cancel_policy": cancel_policy,
    }
    if stage == "availability":
        if price:
            payload["weego_price"] = float(price[0]["price"])
            payload["meal_type"] = price[0].get("meal_type", "")
            payload["cancel_policy"] = price[0].get("cancel_policy", "")
            payload["room_type"] = price[0].get("room_type", "")
            payload["weego_availability"] = price
        else:
            payload["weego_price"] = "当日无报价"
            payload["meal_type"] = ""
            payload["cancel_policy"] = ""
            payload["weego_availability"] = []
            payload["room_type"] = ""
    elif stage == "booking":
        payload["deal_check_code"] = deal_check_code
        payload["voucher"] = voucher
    else:  # preparation or cancellation
        logger.debug(f"stage: {stage}")
    # Except for booking and cancellation, the incoming price is per night;
    # convert it to the total price for the whole stay here.
    if isinstance(payload["weego_price"], float) and stage not in ["booking", "cancellation"]:
        book_day = (datetime.strptime(payload["checkout"], "%Y-%m-%d")
                    - datetime.strptime(payload["checkin"], "%Y-%m-%d")).days
        payload["weego_price"] *= book_day
    res = await db["compair"].insert_one(payload)
    # inserted_id is the public accessor; avoid the name-mangled attribute.
    return res.inserted_id, payload
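# Illustration of the per-night to total conversion above (hypothetical
# values): a 2019-06-01 → 2019-06-04 stay is three nights, so a 100.0 nightly
# quote is stored as 300.0 for any stage other than booking/cancellation.
def _example_total_price():
    nights = (datetime(2019, 6, 4) - datetime(2019, 6, 1)).days  # 3
    return 100.0 * nights  # 300.0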
async def supplier_data(supplier, hotel_id):
    logger = logging.getLogger(__name__)
    coll = settings.SUPPLIER_NAME_2_COLL[settings.SUPPLIER_ID_2_NAME[supplier]]
    if coll == 'wg_hotel':
        db = databases('whotel')
    else:
        db = databases("scripture")
    query = settings.SUPPLIER_QUERY[settings.SUPPLIER_ID_2_NAME[supplier]]
    query["code"] = "1"
    # Hotel ids may be stored as strings or ints, under either "code" or "hotel_id".
    condition = [{'code': hotel_id}, {'hotel_id': str(hotel_id)}]
    try:
        int_hotel_id = int(hotel_id)
        condition.append({"code": int_hotel_id})
        condition.append({"hotel_id": int_hotel_id})
    except (TypeError, ValueError):
        pass
    s_data = await db[coll].find_one({'$or': condition}, query)
    if not s_data:
        logger.info(f'supplier:{supplier}, hotel_id:{hotel_id} cannot query s_data')
        return {}
    logger.info(f'supplier:{supplier}, hotel_id:{hotel_id} query s_data success')
    return formatter_statics_data(settings.SUPPLIER_ID_2_NAME[supplier], s_data)
async def update() -> None:
    logger = logging.getLogger(f'{__name__}.update')
    db = databases("hub")
    query = {
        "__t": "Hotel",
        "edit_status": {"$in": ["edited", "audited"]},
        "publish_status": "online"
    }
    count = await db.poi_items.count_documents(query)
    progress = 0
    async for doc in db.poi_items.find(query, {'facilities': 1}):
        logger.info('progress: {:.2%}'.format(progress / count))
        progress += 1
        facilities = doc.get('facilities')
        if not facilities:
            logger.warning(f'No facilities: {doc["_id"]}')
            continue
        try:
            facilities = ensure_json(facilities)
        except ValueError as exc:
            logger.error(f'{exc}\n _id: {doc["_id"]}')
            continue
        except Exception as exc:
            logger.critical(f'Unknown_error, _id: {doc["_id"]}', exc_info=exc)
            continue
        try:
            data = await fetch_links(facilities)
        except AssertionError as exc:
            logger.error(f'Status unexpected: {exc}, _id: {doc["_id"]}')
            continue
        except Exception as exc:
            logger.critical(f'Unknown_error, _id: {doc["_id"]}', exc_info=exc)
            continue
        try:
            updated_facilities = ensure_bson(data, {"_id": ObjectId})
        except Exception as exc:
            logger.critical(f'Unknown_error, _id: {doc["_id"]}', exc_info=exc)
            continue
        # Fire-and-forget the write; the callback logs the per-document outcome.
        future = asyncio.ensure_future(
            db.poi_items.update_one(
                {'_id': doc['_id']},
                {'$set': {'facilities': updated_facilities}}))
        future.add_done_callback(partial(callback, oid=doc["_id"]))
async def start_ta_spider(city, country, allow_num, lost_city, filter_or_not):
    rds = databases(settings.REDIS)
    to_redis = json.dumps({
        'city': city,
        'country': country,
        'allow_num': allow_num,
        'lost_city': lost_city,
        'filter_or_not': filter_or_not,
        'spider_name': 'tripadvisor'
    })
    try:
        await rds.lpush('distributed_spider', to_redis)
        return True
    except Exception as exc:
        logger.warning("redis error", exc_info=exc)
        return False
async def get_supplier():
    hub = databases('hub')
    hotels = []
    city_map = {}
    country_map = {}
    async for hotel in hub['poi_items'].find(
            {
                "__t": "Hotel",
                "edit_status": {"$in": ["edited", "audited"]},
                "publish_status": "online"
            },
            {
                'name': '1',
                'address': '1',
                'name_en': '1',
                'quote_ids': '1',
                'city': '1'
            }):
        # Cache city/country lookups so each city is resolved only once.
        if hotel['city'] not in city_map:
            city_name = await hub['meta_cities'].find_one(
                {'_id': hotel['city']}, {'name': '1', 'country': '1'})
            if not city_name:
                city_map[hotel['city']] = '已删除'
                country_map[hotel['city']] = '已删除'
            else:
                city_map[hotel['city']] = city_name['name']
                country_name = await hub['meta_countries'].find_one(
                    {"_id": city_name['country']}, {'name': '1'})
                country_map[hotel['city']] = country_name['name']
        # Numeric keys are spreadsheet column indexes.
        line = {
            0: str(hotel['_id']),
            1: hotel['name'],
            2: hotel['name_en'],
            3: hotel['address'],
            4: city_map[hotel['city']],
            14: country_map[hotel['city']]
        }
        for quote in hotel['quote_ids']:
            line[settings.SUPPLIER_ID_2_INDEX[str(quote['quoter'])]] = str(
                quote['hotel_id'])
        hotels.append(line)
    return hotels
async def refresh_access_token(unused_request, email):
    """Refresh access token."""
    scripture = databases('scripture')
    u = await scripture.g_users.find_one({'email': email})  # noqa pylint: disable=C0103
    g = GMailClient(  # pylint: disable=C0103
        client_id=settings.GOOGLE_OAUTH_CLIENT_ID,
        client_secret=settings.GOOGLE_OAUTH_CLIENT_SECRET,
        access_token=u['access_token'],
        request_params=request_params)
    try:
        _, data = await g.refresh_access_token(u['refresh_token'])
        unused_user, u_info = await g.user_info()
        data.update(u_info)
        data['authenticated'] = True
        await scripture.g_users.update_one({'email': email}, {
            '$set': data,
            '$currentDate': {
                'updated_at': True,
                'token_refreshed_at': True
            }
        })
    except Exception as e:  # pylint: disable=W0703,C0103
        logger.exception(e)
        # Mark the user unauthenticated so later requests force a re-login.
        await scripture.g_users.update_one({'email': email}, {
            '$set': {'authenticated': False},
            '$currentDate': {
                'updated_at': True,
                'token_refreshed_at': True
            }
        })
        return json({'text': False}, status=401)
    return json(data)
async def hotel_filter(extra_condition, selected):
    hub = databases("hub")
    condition = {
        "__t": "Hotel",
        "edit_status": {"$in": ["edited", "audited"]},
        "publish_status": "online",
    }
    condition.update(extra_condition)
    select_hotel = set()
    if selected:
        condition["_id"] = {"$in": [ObjectId(_id) for _id in selected]}
    async for hotel in hub["poi_items"].find(condition, {"_id": "1"}):
        select_hotel.add(str(hotel["_id"]))
    # Prefer the ids that actually matched; fall back to the raw selection when
    # nothing matched (equivalent to the original and/or chain).
    return select_hotel or selected
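# Usage sketch (hypothetical condition and id): narrow an explicit selection
# down to hotels that are online and edited/audited.
async def _example_hotel_filter():
    return await hotel_filter(
        {"city": ObjectId("5c0000000000000000000005")},  # hypothetical extra condition
        ["5c0000000000000000000006"],
    )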
async def refresh_price_calendar(request):
    logger = logging.getLogger(__name__)
    body = request.json
    if not body:
        logger.warning("request has no body")
        return rest_result(request, {
            "status": 400,
            "errmsg": "body is empty!"
        })
    hotel_id = body.get("hotel_id", "")
    if not hotel_id:
        logger.warning("hotel_id cannot be None")
        return rest_result(request, {
            "status": 400,
            "errmsg": "hotel_id cannot be None"
        })
    db = databases("scripture")
    status = await db["statics.hotels.prices"].find_one(
        {"hotel_id": hotel_id}, ["selecting", "updated_at"])
    # A refresh already flagged and last updated over a day ago is reported
    # as still in progress.
    if (status and status.get("selecting")
            and status.get("updated_at", datetime.now() + timedelta(days=-1))
            < datetime.now() + timedelta(days=-1)):
        logger.info(f"hotel_id:{hotel_id}, refreshing")
        return rest_result(request, {"status": 200, "data": "Refreshing..."})
    if body.get("bug_price", False):
        if body.get("days"):
            bug_price_one.delay(hotel_id, int(body["days"]))
        else:
            bug_price_one.delay(hotel_id)
        logger.info("bug price task published")
    else:
        calendar_one.delay(hotel_id)
    await db["statics.hotels.prices"].update_one(
        {"hotel_id": hotel_id}, {"$set": {"selecting": True}})
    logger.info(f"hotel_id:{hotel_id}, publish task succeeded")
    return rest_result(request, {"status": 200, "data": "succeed"})
async def record_quotes(request):
    request.headers['accept'] = 'application/json'
    hub = databases('hub')
    body = request.json
    if not body:
        return rest_result(request, {
            'status': 400,
            'errmsg': 'body must be non-empty!'
        })
    body['query_time'] = datetime.utcfromtimestamp(int(body['time_stamp']))
    cms_id = body.get('cms_hotel_id')
    if not cms_id:
        logger.error(f"invalid request without cms_id\n{body}")
        return rest_result(request, {
            'status': 400,
            'errmsg': f'cms_id: {cms_id} illegal.'
        })
    logger.info(f"save pull_down data with {body}")
    cms = await hub['poi_items'].find_one({"_id": ObjectId(cms_id)},
                                          {'city': '1'})
    if not cms or not cms.get('city'):
        logger.error(f"invalid cms id: {cms_id} without city")
        return rest_result(request, {
            'status': 400,
            'errmsg': f'cms_id: {cms_id} has no city.'
        })
    # Cache city name lookups; fall back to "Unknown" when the city is gone.
    if cms['city'] not in city_maps:
        city = await hub['meta_cities'].find_one({"_id": cms['city']},
                                                 {'name': '1'})
        if city:
            city_maps[cms['city']] = city['name']
    if cms['city'] in city_maps:
        body['city'] = city_maps[cms['city']]
    else:
        logger.error(f"city {cms['city']} not found for cms_id {cms_id}")
        body['city'] = 'Unknown'
    body.pop('time_stamp', '')
    resp = await pulldown.create(**body)
    return rest_result(request, {'status': 200, 'data': resp.to_dict()})
async def skyscanner(request):
    valid = await validate_request(request)
    if valid["errmsg"] and not request.json.get("provider"):
        return rest_result(request, {"status": 400, "errmsg": valid["errmsg"]})
    start_time = valid["start_time"]
    days = valid["days"]
    for hotel in valid["hotels"]:
        get_skyscanner.delay(start_time, days, hotel_id=hotel)
    db = databases("scripture")
    # Resolve provider hotel names to Skyscanner sids, case-insensitively.
    for hotel_id, hotel_name in request.json.get("provider", {}).items():
        sid = await db["statics.hotels.skyscanner"].find_one(
            {"name": {"$regex": hotel_name.lower(), "$options": "i"}},
            {"sid": "1"},
        )
        if sid:
            get_skyscanner.delay(start_time, days, sid=sid["sid"],
                                 hotel_id=hotel_id)
    return rest_result(request, {"status": 200, "data": "ok"})
async def update_premium(data, fir_query):
    logger = logging.getLogger(__name__)
    db = databases(settings.REDIS)
    # Poll redis for the comparison result, giving up after 20 attempts.
    for _ in range(20):
        compare_msg = await db.get(fir_query)
        if not compare_msg:
            await asyncio.sleep(5)  # don't block the event loop
            continue
        data.update(json.loads(compare_msg))
        await async_price_compare_check(data)
        # Forward the data to the premium system for dynamic premium pricing.
        # async with aiohttp.ClientSession() as sess:
        #     async with sess.post(
        #             f"",
        #             headers={},
        #             json=data
        #     ) as res:
        #         ...
        logger.info(f"{fir_query} send data to premium succeeded!")
        return
    logger.warning(f"{fir_query} no comparison data after 20 attempts")
async def get_booking_prices(url, cms_id, start_time, days,
                             spider_name='booking_prices', **kwargs):
    rds = databases(settings.REDIS)
    _url = URL(url)
    if _url.host not in ['www.booking.com', 'm.ctrip.com']:
        logger.info(f"invalid params : {url}")
        return False
    base_url = f"{_url.scheme}://{_url.host}{_url.path}"
    try:
        days = int(days)
        # Round-trip through strptime to validate the date format.
        checkin = datetime.strptime(start_time, '%Y-%m-%d').strftime('%Y-%m-%d')
    except Exception as exc:
        logger.info("invalid params", exc_info=exc)
        return False
    if not 0 < days < 91:
        logger.info(f"invalid params : {days}")
        return False
    to_redis = json.dumps({
        'spider_name': spider_name,
        'base_url': base_url,
        'cms_id': cms_id,
        'start_time': checkin,
        'days': days,
        **kwargs,
    })
    try:
        await rds.lpush('distributed_spider', to_redis)
        return True
    except Exception as exc:
        logger.warning("redis error", exc_info=exc)
        return False
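# Usage sketch (hypothetical url and id): queue a 30-day Booking.com price
# crawl starting 2019-06-01; returns True once the task is pushed onto the
# "distributed_spider" list.
async def _example_get_booking_prices():
    return await get_booking_prices(
        url="https://www.booking.com/hotel/fr/example.html",
        cms_id="5c0000000000000000000003",
        start_time="2019-06-01",
        days=30,
    )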
async def destination(destnation_ids, type_name="destinations"):
    type_mapping = {
        "destinations": {"type_name": "destination", "type_code": 8},
        "cities": {"type_name": "city", "type_code": 16},
        "provinces": {"type_name": "province", "type_code": 32},
        "countries": {"type_name": "country", "type_code": 64},
    }
    docs = []
    awaitable_cursor = (databases("agent")
                        .get_collection("statics.{}".format(type_name))
                        .find(
                            {"_id": {"$in": [ObjectId(_id) for _id in destnation_ids]}},
                            {
                                "country_id": 1,
                                "province_id": 1,
                                "city_id": 1,
                                "name_cn": 1,
                                "name_en": 1,
                                "name_alts": 1,
                                "weight": 1,
                                "hotel_count": 1,
                            },
                        ))
    async for dest in awaitable_cursor:
        doc = {
            "name_en": dest["name_en"],
            "name_cn": dest.get("name_cn", ""),
            "type": type_mapping[type_name]["type_name"],
            "type_code": type_mapping[type_name]["type_code"],
            "id": str(dest["_id"]),
            "weight": dest["weight"],
            "hotel_count": dest.get("hotel_count", 0),
        }
        # Collect alternative names plus the primary EN/CN names for search.
        if "name_alts" in dest:
            name_alts = dest["name_alts"].split(",")
        else:
            name_alts = []
        if doc["name_en"]:
            name_alts.append(doc["name_en"])
        if doc["name_cn"]:
            name_alts.append(doc["name_cn"])
        doc["name_alts"] = name_alts
        country_id = dest.get("country_id")
        province_id = dest.get("province_id")
        city_id = dest.get("city_id")
        if country_id:
            country = await get_country_by_id(country_id) or {
                "name_cn": "",
                "name_en": "",
            }
            # doc["country_id"] = str(country_id)
            doc["country_name_cn"] = country.get("name_cn", "")
            doc["country_name_en"] = country["name_en"]
        if province_id:
            province = await get_province_by_id(province_id) or {
                "name_cn": "",
                "name_en": "",
            }
            # doc["province_id"] = str(province_id)
            doc["province_name_cn"] = province.get("name_cn", "")
            doc["province_name_en"] = province["name_en"]
        if city_id:
            city = await get_city_by_id(city_id) or {
                "name_cn": "",
                "name_en": "",
            }
            # doc["city_id"] = str(city_id)
            doc["city_name_cn"] = city.get("name_cn", "")
            doc["city_name_en"] = city["name_en"]
        docs.append(doc)
    return await solr_add(docs, "destinations")
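# Shape of one Solr document produced above (all values hypothetical):
# {
#     "name_en": "Kyoto", "name_cn": "京都", "type": "city", "type_code": 16,
#     "id": "5c0000000000000000000004", "weight": 10, "hotel_count": 120,
#     "name_alts": ["Kioto", "Kyoto", "京都"],
#     "country_name_cn": "日本", "country_name_en": "Japan",
# }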
async def index(request, email):
    """Trigger parse gmail."""
    access_token = request.form.get('access_token')
    if not access_token:
        return json({
            'status': 400,
            'error': 'access_token must be provided!'
        }, status=400)
    # Guard against a missing value before stripping stray quotes.
    refresh_token = (request.form.get('refresh_token') or '').strip('"').strip("'")
    if not refresh_token:
        return json({
            'status': 400,
            'error': 'refresh_token must be provided!'
        }, status=400)
    id_token = request.form.get('id_token')
    expires_in = int(request.form.get('expires_in', 3599))
    token_type = request.form.get('token_type')
    scripture = databases('scripture')
    await scripture.g_users.update_one(
        {'email': email},
        {
            '$set': {
                'access_token': access_token,
                'refresh_token': refresh_token,
                # 'id_token': id_token,
                # 'expires_in': expires_in,
                # 'token_type': token_type,
                'email': email
            },
            '$currentDate': {'updated_at': True},
            '$setOnInsert': {'created_at': datetime.now()},
        },
        upsert=True)
    g = GMailClient(  # pylint: disable=C0103
        client_id=settings.GOOGLE_OAUTH_CLIENT_ID,
        client_secret=settings.GOOGLE_OAUTH_CLIENT_SECRET,
        access_token=access_token,
        request_params=request_params)
    _, data = await g.user_info()
    uri = f'users/{email}/messages'
    token = {
        'access_token': access_token,
        'refresh_token': refresh_token,
        'id_token': id_token,
        'expires_in': expires_in,
        'token_type': token_type
    }
    gmail.do_request.apply_async(
        (uri, token),
        link=gmail.dispatcher.s(email=email, uid=data['id'], token=token))
    # Refresh the access token right as it expires.
    gmail.refresh_access_token.apply_async((email, ), countdown=expires_in)
    return json(data)
async def update_hotel(request):
    logger = logging.getLogger(__name__)
    try:
        body = request.json
    except exceptions.InvalidUsage:
        body = None
    if not body or not isinstance(body, dict):
        logger.warning(f"Invalid request body: {request.body}")
        raise exceptions.InvalidUsage({
            "status": 400,
            "errmsg": "Invalid request body"
        })
    oid = body.get("_id")
    if not oid:
        logger.warning('_id is required!')
        raise exceptions.InvalidUsage({
            "status": 400,
            "errmsg": "_id is required!"
        })
    db = databases("scripture")
    doc = await db.statics.hotels.relux.rooms.find_one(
        {"_id": ObjectId(oid)}, {"rooms_cn": 1})
    if not doc:
        logger.warning(f'oid:{oid} corresponding hotel not found')
        raise exceptions.NotFound({
            "status": 404,
            "errmsg": "Hotel not found!"
        })
    ori_rooms_cn = doc.get("rooms_cn")
    if not ori_rooms_cn:
        logger.warning(f'oid:{oid} corresponding hotel rooms_cn not found')
        raise exceptions.NotFound({
            "status": 404,
            "errmsg": "Hotel rooms_cn not found!"
        })
    rooms_cn = body.get("rooms_cn")
    if not rooms_cn:
        logger.warning(f'oid:{oid} rooms_cn is required')
        raise exceptions.InvalidUsage({
            "status": 400,
            "errmsg": "rooms_cn is required!"
        })
    futures = []
    for room, ori_room in zip(rooms_cn, ori_rooms_cn):
        room_id = room["id"]
        if room_id != ori_room["id"]:
            logger.error(f"The order of the rooms is out of order: "
                         f"room({room}), ori_doc({doc})")
            continue
        if room == ori_room:
            continue
        for plan, ori_plan in zip(room.get("plans", []),
                                  ori_room.get("plans", [])):
            if plan["id"] != ori_plan["id"]:
                logger.error(f"The order of the plans is out of order: "
                             f"plan({plan}), ori doc({doc})")
                continue
            if plan == ori_plan:
                continue
            # Preserve the original name/feature before overwriting.
            if plan["name"] != ori_plan["name"]:
                plan["ori_name"] = ori_plan["name"]
            if plan["feature"] != ori_plan["feature"]:
                plan["ori_feature"] = ori_plan["feature"]
        future = asyncio.ensure_future(
            db.statics.hotels.relux.rooms.update_one(
                {
                    "_id": ObjectId(oid),
                    "rooms_cn.id": room_id,
                    "rooms_cn.plans": ori_room["plans"]
                },
                {"$set": {"rooms_cn.$.plans": room["plans"]}}))
        extra = {
            "oid": oid,
            "ori_plan": ori_room["plans"],
            "new_plan": room["plans"]
        }
        future.add_done_callback(partial(callback, extra=extra))
        futures.append(future)
    if not futures:
        logger.warning(f'oid:{oid} no difference or rooms order is wrong')
        raise exceptions.InvalidUsage({
            "status": 400,
            "errmsg": "No difference or rooms order is wrong"
        })
    logger.info(f'oid:{oid} update_relux_plan success')
    return response.json({"status": 200, "data": {"count": len(futures)}})
async def extract_data(start_time, end_time, stages=None):
    logger = logging.getLogger(__name__)
    db = databases("scripture")
    hub = databases("hub")
    condition = {"stage": {"$in": []}, "created_at": {}}
    for stage in stages or []:
        if stage in settings.STAGES:
            condition["stage"]["$in"].append(stage)
        else:
            logger.error(f"invalid stage: {stage}!")
    if isinstance(start_time, str):
        # Compatibility with scripture-views, which sends "%Y-%m-%dT%H:%M".
        if "T" in start_time:
            start_time = datetime.strptime(start_time, "%Y-%m-%dT%H:%M")
        else:
            start_time = datetime.strptime(start_time, "%Y-%m-%d %H:%M:%S")
    condition["created_at"]["$gte"] = start_time
    if isinstance(end_time, str):
        if "T" in end_time:
            end_time = datetime.strptime(end_time, "%Y-%m-%dT%H:%M")
        else:
            end_time = datetime.strptime(end_time, "%Y-%m-%d %H:%M:%S")
    # The server runs in UTC, 8 hours behind Beijing time; incoming times
    # must already have 8 hours subtracted.
    condition["created_at"]["$lte"] = end_time
    result = {"hotel_count": {}, "city_count": {}, "user_count": {}}
    cms_msgs = {}
    async for data in db["compair"].find(condition):
        # Cache hotel lookups per cms_id.
        if data['cms_id'] not in cms_msgs:
            cms_msgs[data['cms_id']] = await hub["poi_items"].find_one(
                {"_id": ObjectId(data["cms_id"])},
                {"name": "1", "name_en": "1", "city": "1", "address": "1"},
            )
        cms_msg = cms_msgs[data['cms_id']]
        if not cms_msg:
            logger.error(f"{data['cms_id']} not found!")
            continue
        if cms_msg["city"] not in city_map:
            city_msg = await hub["meta_cities"].find_one(
                {"_id": cms_msg["city"]}, {"name": "1"})
            city_map[cms_msg["city"]] = (city_msg["name"]
                                         if city_msg else "该城市已被删除")
        city_name = city_map[cms_msg["city"]]
        city_count_key = f"{city_name}:{data['stage']}"
        if city_count_key not in result["city_count"]:
            result["city_count"][city_count_key] = {
                "count": 0,
                "stage": data["stage"],
            }
        result["city_count"][city_count_key]["count"] += 1
        data["city_name"] = city_name
        data["name_cn"] = cms_msg["name"]
        data["name_en"] = cms_msg["name_en"]
        data["address"] = cms_msg["address"]
        # The server runs in UTC, 8 hours behind Beijing time; add 8 hours
        # for display. Normalize query_time to a plain string first.
        if isinstance(data["query_time"], str):
            if "." in data["query_time"]:
                data["query_time"] = str(
                    datetime.strftime(
                        datetime.strptime(data["query_time"],
                                          "%Y-%m-%d %H:%M:%S.%f"),
                        "%Y-%m-%d %H:%M:%S",
                    ))
            else:
                data["query_time"] = str(
                    datetime.strptime(data["query_time"],
                                      "%Y-%m-%d %H:%M:%S"))
        else:
            data["query_time"] = str(data["query_time"])
        del data['_id']
        del data['created_at']
        hotel_count_key = f"{data['cms_id']}:{data['stage']}"
        if hotel_count_key not in result["hotel_count"]:
            result["hotel_count"][hotel_count_key] = {
                "count": 0,
                "data": data,
                "stage": data["stage"],
            }
        result["hotel_count"][hotel_count_key]["count"] += 1
        # The raw rows are too large and currently unneeded; they are no
        # longer returned.
        user_count_key = f"{data['user_id']}"
        if not user_count_key.strip():
            continue
        if user_count_key not in result['user_count']:
            result['user_count'][user_count_key] = {
                "count": 0,
                "availability": {"count": 0, "hotel": {}},
                "preparation": {"count": 0, "hotel": {}},
                "booking": {"count": 0, "hotel": {}},
                "cancellation": {"count": 0, "hotel": {}},
            }
        stage_stats = result['user_count'][user_count_key][data['stage']]
        stage_stats['hotel'].setdefault(data['cms_id'], 0)
        stage_stats['hotel'][data['cms_id']] += 1
        result['user_count'][user_count_key]['count'] += 1
        stage_stats["count"] += 1
    return result
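# Shape of the aggregation returned above (all counts hypothetical):
# {
#     "hotel_count": {"<cms_id>:availability": {"count": 3, "data": {...},
#                                               "stage": "availability"}},
#     "city_count": {"Tokyo:availability": {"count": 3,
#                                           "stage": "availability"}},
#     "user_count": {"<user_id>": {"count": 3,
#                                  "availability": {"count": 3,
#                                                   "hotel": {"<cms_id>": 3}},
#                                  ...}},
# }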
async def list_website_crawled_by_id(request, crawled_id):
    """Get the list of websites we crawled from, by crawled id."""
    logger = logging.getLogger(__name__)
    scripture = databases("scripture")
    # A 24-char id is a raw ObjectId; anything else is treated as a hotel_id.
    if len(crawled_id) == 24:
        crawled = await scripture.capture_urls.find_one(
            {"_id": ObjectId(crawled_id)})
    else:
        crawled = await scripture.capture_urls.find_one(
            {"hotel_id": crawled_id})
    if not crawled:
        logger.warning("Bad crawled id %s", crawled_id)
        return rest_result(request, {
            "status": 400,
            "err_msg": "Invalid capture_id"
        })
    filters = request.args.get("filters")
    if filters and isinstance(filters, str):
        kwargs = {
            "hotels_cn_id": crawled.get("hotels_cn_id"),
            "jset_id": crawled.get("jset_id"),
            "bookings_id": crawled.get("bookings_id"),
            "capture_id": crawled_id,
        }
        logger.debug(f"kwargs : {kwargs}")
        hotel = await request.app.loop.run_in_executor(
            None,
            lambda: hub_hotel.HubHotel(**kwargs).to_dict(
                columns=filters.split(",")),
        )
        logger.info(f"crawled_id:{crawled_id}, api hotel : {hotel}")
        keys = [f.strip() for f in filters.split(",")]
        if "all" in keys:
            logger.info(
                f'crawled_id:{crawled_id} crawled success and "all" in keys')
            return rest_result(request, {"hotel": hotel, "status": 200})
        hotel = {key: filed_formatter(key, hotel.get(key)) for key in keys}
        logger.info(f'crawled_id:{crawled_id} crawled success')
        return rest_result(request, {"hotel": hotel, "status": 200})
    urls = {}
    if crawled.get("hotels_cn_id") or crawled.get("_hotels_cn_id"):
        urls["hotels.cn"] = (
            "https://www.hotels.cn/"
            f'ho{crawled.get("hotels_cn_id") or crawled.get("_hotels_cn_id")}'
        )  # noqa
    if crawled.get("jset_id") or crawled.get("_jset_id"):
        jset = await scripture.jsets.find_one(
            {"jset_id": crawled.get("jset_id") or crawled.get("_jset_id")})
        if not jset:
            logger.warning("Jset not found %s", crawled.get("jset_id"))
        else:
            urls["jetsetter.com"] = jset["url"]
    if crawled.get("hotel_id") and not crawled.get("bookings_id"):
        urls["roomsxml id"] = crawled.get("hotel_id")
    if crawled.get("bookings_id") or crawled.get("_bookings_id"):
        urls["bookings"] = crawled.get("bk_url")
    if urls:
        logger.info(f'crawled_id:{crawled_id}, found url:{urls}')
        return rest_result(request, {"urls": urls, "status": 200})
    logger.info(f'crawled_id:{crawled_id}, not found url')
    return rest_result(request, {"err_msg": "Not Found", "status": 404})
async def crawl_hcom(capture_id, url):
    logger = logging.getLogger(__name__)
    scripture = databases("scripture")
    if len(capture_id) == 24:
        crawled = await scripture.capture_urls.find_one(
            {"_id": ObjectId(capture_id)})
    else:
        crawled = await scripture.capture_urls.find_one(
            {"hotel_id": capture_id})
    if not crawled:
        logger.warning("Bad crawled id %s", capture_id)
        return {"status": 400, "errmsg": "Invalid capture_id"}
    targets = ["www.hotels.cn", "www.hotels.com"]
    # Accept either a bare numeric hotel id or a full hotels.cn/hotels.com URL.
    hid = re.match(r"\d+", url)
    if hid:
        hid = hid.group()
        cn_url = f"https://www.hotels.cn/ho{hid}"
        en_url = f"https://www.hotels.com/ho{hid}/?pos=HCOM_US&locale=en_US"
    else:
        _url = URL(url)
        if _url.host in targets:
            hid = _url.path.strip("/").split("/")[0][2:]
            cn_url = f"https://www.hotels.cn/ho{hid}"
            en_url = (
                f"https://www.hotels.com/ho{hid}/?pos=HCOM_US&locale=en_US")
        else:
            logger.warning("Bad url %s", url)
            return {"status": 400, "errmsg": "Invalid url"}
    cn_req = requests.get(cn_url)
    cn_req = cn_req.content.decode("utf-8")
    cn_et = etree.HTML(cn_req)
    title = get_log(cn_et,
                    field="title",
                    rule=hotels_xp.TITLE,
                    choice="take_first")
    # The generic site title means the hotel id did not resolve to a page.
    if title == "好订网酒店预订 国际酒店预订 特价国外酒店预订 – 网上订酒店就到Hotels.cn":
        logger.warning("Bad hotel id %s", hid)
        return {"status": 400, "errmsg": "Invalid url"}
    payload = hcom_parse(hid, cn_url, cn_et)
    headers = {
        "User-Agent":
            "Mozilla/5.0 (Windows NT 10.0; WOW64) AppleWebKit/537.36 "
            "(KHTML, like Gecko) Chrome/69.0.3497.100 Safari/537.36"
    }
    en_req = requests.get(en_url, headers=headers)
    en_req = en_req.content.decode("utf-8")
    en_et = etree.HTML(en_req)
    en_payload = hcom_parse(hid, en_url, en_et)
    payload["url"] = cn_url
    payload["us_url"] = en_url
    payload["en"] = en_payload
    if payload["address_text"] and payload["name"]:
        res = await scripture.hotels.update_one({"hotels_id": hid},
                                                {"$set": payload},
                                                upsert=True)
        if res.modified_count:
            await scripture.capture_urls.update_one(
                {"_id": crawled["_id"]},
                {
                    "$set": {
                        "_hotels_cn_id": "",
                        "hotels_cn_id": hid,
                        "bookings_id": "",
                        "_bookings_id": crawled.get("bookings_id"),
                        "jset_id": "",
                        "_jset_id": crawled.get("jset_id"),
                    }
                },
            )
            logger.info(f'hotels_cn_id:{hid} upload success')
            return {"status": 200, "data": payload}
        logger.info(f'hotels_cn_id:{hid} upload fail')
        return {"status": 500, "errmsg": "酒店静态数据更新失败!"}
    if not payload["name"]:
        logger.error(f"failed to scrape hotel name, url:{en_url}, "
                     f"xpath:{hotels_xp.NAME}")
    if not payload["address_text"]:
        logger.error(f"failed to scrape hotel address, url:{en_url}, "
                     f"xpath:{hotels_xp.ADDRESS}")
    return {"status": 500, "errmsg": "酒店静态数据更新失败!"}
async def crawl_statics_data(request, crawled_id):
    logger = logging.getLogger(__name__)
    url = request.args.get("url")
    _url = URL(url)
    url = f"{_url.scheme}://{_url.host}{_url.path}"
    website = request.args.get("website")
    scripture = databases("scripture")
    if not website or website not in crawling:
        logger.warning("Bad website: missing or not in crawling")
        return rest_result(request, {
            "status": 400,
            "err_msg": "Invalid website!"
        })
    if len(crawled_id) == 24:
        exists = await scripture.capture_urls.find_one(
            {"_id": ObjectId(crawled_id)})
    else:
        exists = await scripture.capture_urls.find_one(
            {"hotel_id": crawled_id})
    if not exists:
        logger.info(f"invalid crawled_id!: {crawled_id}")
        return rest_result(request, {
            "status": 400,
            "errmsg": f"invalid crawled_id!: {crawled_id}"
        })
    if website in exists and exists[website] == url:
        logger.info(f'{crawled_id} website already crawled')
        return rest_result(request, {"status": 200, "data": "此网站已经抓取"})
    if website == "bk_url":
        try:
            res = await crawl_booking(str(exists["_id"]), url)
        except Exception as exc:
            logger.warning("crawl_booking failed", exc_info=exc)
            res = {
                "status": 500,
                "errmsg": f"网站抓取失败,请联系刘博文同学([email protected])\n{exc}",
            }
    elif website == "hcom_id":
        try:
            res = await crawl_hcom(crawled_id, url)
        except Exception as exc:
            logger.warning("crawl_hcom failed", exc_info=exc)
            res = {
                "status": 500,
                "errmsg": f"网站抓取失败,请联系刘博文同学([email protected])\n{exc}",
            }
    else:
        logger.warning(f'Invalid website:{website}')
        return rest_result(request, {
            "status": 400,
            "err_msg": "Invalid website!"
        })
    if res and res.get('status') == 200:
        await scripture.capture_urls.update_one(
            {"_id": exists["_id"]}, {"$set": {website: url}})
        logger.info(f'url:{url}, hotel statics data crawled')
        return rest_result(request, {"status": 200, "data": "酒店静态数据抓取完成!"})
    logger.warning(f'{crawled_id} crawl failed. errmsg: {res["errmsg"]}')
    return rest_result(request, {
        "status": 500,
        "errmsg": f"{res['errmsg']}"
    })