Python classifyの例、CrawlerSystemUntils.CrawlerUntils.classify Pythonの例

コード例 #1

0

ファイルを表示

ファイル: CtripPrice_new.py プロジェクト: wyc1314/CrawlerSystem

 def get_price(self, response):
     """Yield one price item per product found in the response body.

     The body is decoded, the fragment between ``"html":`` and
     ``isFullHouse`` is cut out, and that fragment is split into room
     records and then product records with regular expressions.  Every
     product yields a deep copy of ``response.meta["item"]`` filled with
     the fields extracted from that product record.
     """
     base_item = response.meta.get("item")
     page_text = response.body.decode()
     html_block = extract_re(""""html":(.*?)isFullHouse""", page_text)
     room_chunks = re.findall(
         """room_unfold(.*?)class='clicked hidden""", html_block)
     for room_chunk in room_chunks:
         # Room type name; written to the shared item so that every
         # product copy made below inherits it.
         room_name = extract_re(r"""RoomName\\":\\"(.*?)\\""", room_chunk)
         base_item["ROOM_TYPE"] = room_name
         product_chunks = re.findall(
             r"""data-hotelInvoice(.*?class=\\"hotel_room_last\\">.*?<\\/div>)""",
             room_chunk)
         for product_chunk in product_chunks:
             product = copy.deepcopy(base_item)

             # Product name (second capture group of the pattern).
             product["PRODUCT_TYPE"] = extract_re(
                 r"""(room_type_name\\".*?background-image:url\(|room_type_name\\".*?)([^>"]*?)(<br\\/>[^']|\)\\"><|\\/span>|<\\/[es])""",
                 product_chunk,
                 group_num=2)

             # Payment method, mapped to a code through classify().
             payment_text = extract_re(
                 r"""payment_txt\\".*?>(.*?)<""", product_chunk)
             payment_code = classify(
                 {
                     "0": "(在线付)",
                     "2": "(担保)",
                     "1": "(到店付)"
                 }, payment_text)
             product["PAYMENT_TYPE"] = payment_code or "null"

             # Agent label; "true" when no label was extracted.
             agent_label = extract_re(
                 r"""data-role=\\"title\\">(.*?)<\\/span>""", product_chunk)
             product["IS_NOT_AGENT"] = agent_label or "true"

             # Booking status taken from the button text.
             button_text = extract_re(
                 r"""btns_base22_main\\">(.*?)<""", product_chunk)
             product["AVAILABLE_ROOM_SITUATION"] = classify(
                 {
                     "可预订": "(预订)",
                     "满房": "(订完)"
                 }, button_text)

             # Breakfast
             breakfast = extract_re(r"""col4'>(.*?)<""", product_chunk)
             product["BREAKFAST"] = breakfast or "null"

             # Original price
             product["ORIGINAL_PRICE"] = extract_re(
                 r"""data-price='(\d+)'""", product_chunk)

             # Package price; "0" when nothing was extracted.
             package_price = extract_re(
                 r"""rt_origin_price\\"><dfn>&yen;<\\/dfn>(.*?)<""",
                 product_chunk)
             product["DISCOUNT_PRICE"] = package_price or "0"

             # Cashback amount; "0" when nothing was extracted.
             cashback = extract_re(r"""span>返现(.*?)<""", product_chunk)
             product["DISCOUNT"] = cashback or "0"

             print(product)
             yield product

コード例 #2

0

ファイルを表示

 def get_home_review(self,response):
     """Parse one page of hotel reviews and yield the extracted items.

     On the first page (``response.meta["page"]`` missing or equal to 1)
     the summary item from ``response.meta["ctripZongHeItem"]`` is filled
     with the overall rating and review count and yielded, and one
     follow-up ``Request`` is yielded for every page from 2 up to
     ``self.next_page_num``, each calling back into this method.

     On every page, each review block found in the body yields its own
     deep copy of ``response.meta["item"]`` filled with the review
     fields, so items that were already yielded are never overwritten by
     later records.
     """
     eleven = response.meta["eleven"]
     # NOTE(review): newer Scrapy releases expose this as ``response.text``
     # -- confirm which Scrapy version this spider targets.
     html = response.body_as_unicode()
     if response.meta.get("page", 1) == 1:
         item = response.meta["item"]
         ctripZongHeItem = response.meta["ctripZongHeItem"]
         ctripZongHeItem["RATING_VALUE"] = extract_re(r"<span class='score'><span class='n'>(.*?)</span>", html)
         ctripZongHeItem["REVIEW_COUNT"] = extract_re(r"<span id='All_Comment' >全部\((\d+)\)", html)
         ctripZongHeItem["GUEST_TYPE"] = "null"
         print(ctripZongHeItem)
         yield ctripZongHeItem
         for page in range(2, int(self.next_page_num) + 1):
             url = "http://hotels.ctrip.com/Domestic/tool/AjaxHotelCommentList.aspx?MasterHotelID={hotel_id}&hotel={hotel_id}" \
                   "&NewOpenCount=0&AutoExpiredCount=0&RecordCount=1253&OpenDate=2016-01-01&card=-1&property=-1&userType=-1&" \
                   "productcode=&keyword=&roomName=&orderBy=1&viewVersion=c&currentPage={page}&contyped=0&" \
                   "callback=CASwaGVffbjQXgEzk&eleven={eleven}".format(hotel_id=item["ctrip_hotel_id"], page=page,
                                                                       eleven=eleven)
             yield Request(url, headers=response.request.headers, cookies=response.request.cookies, callback=self.get_home_review, priority=20+self.__class__.num,
                                  meta={"item": copy.deepcopy(item), "page": page, "eleven": eleven,"is_need_proxy":True},
                                  )
     RECORDS = re.findall(r"class='comment_block J_asyncCmt'(.*?)</div></div></div>", html, re.DOTALL)
     for RECORD in RECORDS:
         # A fresh copy per review: reusing the single meta dict would make
         # every yielded item the same object, mutated on each iteration.
         item = copy.deepcopy(response.meta["item"])
         USER_NAME = extract_re(r"class='name'><span>(.*?)</span>", RECORD)
         REVIEW_RATING_VALUE = extract_re(r"<span class='n'>([0-9]\d*\.?\d*)</span>分", RECORD)
         ACCOMMODATION_TIME = extract_re(r"class='time'>发表于(.*?)</span>", RECORD)
         RAW_REVIEW_CONTENT = extract_re(r"class='J_commentDetail'>(.*?)</div>", RECORD)
         REVIEW_ID = extract_re(r"data-cid='(.*?)'", RECORD)
         USER_AVATAR = extract_re(r"class='head'.*?img src='(.*?)'", RECORD)
         REPLY_REVIEW_CONTENT = extract_re(r"class='htl_reply'.*?class='text.*?>(.*?)</p>", RECORD)
         RAW_PICTURES = extract_re(r"class='comment_pic'(.*?)class='comment_bar'", RECORD)
         sourceSiteId = classify({"7": "(艺龙网用户)", "1": "(去哪儿网用户)", "2": "(.+)"}, USER_NAME)
         item["userName"] = USER_NAME
         item["reviewRatingValue"] = REVIEW_RATING_VALUE
         item["accommodationTime"] = ACCOMMODATION_TIME + " 00:00:00"
         item["replyReviewContent"] = REPLY_REVIEW_CONTENT
         item["reviewId"] = REVIEW_ID
         item["userAvatar"] = USER_AVATAR
         # "false" when reply text was extracted, "true" otherwise.
         item["isNotReply"] = "false" if REPLY_REVIEW_CONTENT else "true"
         item["rawReviewContent"] = RAW_REVIEW_CONTENT
         item["RAW_PICTURES"] = RAW_PICTURES
         EMOTION_TYPE = classify(
             {"0": "(1\\.|2\\.|3\\.0|3\\.1|3\\.2|3\\.3|3\\.4)", "2": "(3\\.5|3\\.6|3\\.7|3\\.8|3\\.9)",
              "1": "(^4$|5|4\\.)"}, REVIEW_RATING_VALUE)
         item["emotionType"] = EMOTION_TYPE
         item["sourceSiteId"] = sourceSiteId
         print(item)
         yield item

コード例 #3

0

ファイルを表示

ファイル: MeiTuanReview.py プロジェクト: wyc1314/CrawlerSystem

 def get_review(self, response):
     """Yield one review item per entry of the JSON ``data.feedback`` list.

     The response body is parsed as JSON.  For each feedback entry a deep
     copy of ``response.meta["item"]`` is filled with the review fields
     read from that entry and then yielded.
     """
     payload = json.loads(response.body_as_unicode())
     for entry in payload['data']['feedback']:
         score = entry.get('score', '')
         reply_text = entry.get('bizreply', '')
         avatar = entry.get('avatar', '')

         review = copy.deepcopy(response.meta["item"])
         review['replyReviewTime'] = entry.get('replytime', '')
         review["userName"] = entry.get('username', '')
         review["reviewRatingValue"] = score
         review["accommodationTime"] = entry.get('feedbacktime', '') + " 00:00:00"
         review["replyReviewContent"] = reply_text
         review["reviewId"] = entry.get("id", "")
         review["userAvatar"] = avatar or ""
         # "true" only when the entry carries no reply text.
         review["isNotReply"] = 'true' if not reply_text else 'false'
         review["rawReviewContent"] = entry.get('comment', '')
         # No picture data is read from the feedback entry.
         review["rawPictures"] = ""
         review['userId'] = entry.get('userid', '')
         review["userLocation"] = ""
         score_patterns = {
             "0": "(1|2)",
             "2": "(3)",
             "1": "(4|5)"
         }
         review["emotionType"] = classify(score_patterns, score)
         review["sourceSiteId"] = review.get("siteId", "6")
         yield review

コード例 #4

0

ファイルを表示

ファイル: MeiTuanPrice.py プロジェクト: wyc1314/CrawlerSystem

 def get_meituran_price(self, response):
     """Yield one price item per aggregate product in the JSON response.

     The body is parsed as JSON and ``mergeList.data`` is iterated.  For
     every room record the hotel id of ``response.meta["item"]`` is added
     to ``set_`` (a name defined outside this block) and the current size
     of that collection is printed.  Each entry of the room's
     ``aggregateGoods`` then yields a deep copy of the item filled from
     the entry's ``prepayGood`` data.
     """
     payload = json.loads(response.body_as_unicode())
     room_records = payload.get("mergeList", {}).get("data", {})
     for room in room_records:
         base_item = response.meta.get("item")
         set_.add(base_item["BIG_DATA_HOTEL_ID"])
         print(len(set_))
         base_item["ROOM_TYPE"] = room.get("roomCellName", "")
         for goods in room.get("aggregateGoods", []):
             priced = copy.deepcopy(base_item)

             # Product name
             priced["PRODUCT_TYPE"] = goods.get("aggregateGoodName", "") or "null"

             prepay = goods["prepayGood"]

             # Booking status
             status_patterns = {
                 "满房": "(0)",
                 "可预订": "(1)"
             }
             priced["AVAILABLE_ROOM_SITUATION"] = classify(
                 status_patterns, str(prepay["goodsStatus"]))

             # Breakfast
             priced["BREAKFAST"] = prepay.get("breakfast", "null")

             # Original price
             average_price = prepay["averagePrice"]
             priced["ORIGINAL_PRICE"] = average_price or ""

             # Agent tag; "true" when the tag is missing or empty.
             priced["IS_NOT_AGENT"] = prepay.get("tagName", "") or "true"

             # Payment method
             priced["PAYMENT_TYPE"] = classify(
                 {"0": "(在线付)"}, prepay.get("reserveTips", "0"))

             # Average price (already discounted): same value as the
             # original price.
             priced["DISCOUNT_PRICE"] = average_price or ""

             # Cashback
             priced["DISCOUNT"] = "0"
             yield priced

コード例 #5

0

ファイルを表示

ファイル: BB.py プロジェクト: wyc1314/CrawlerSystem

# Build and print one review dict per record in RECORDS, counting records in
# ``num``.  RECORDS, num, extract_re and classify come from outside this
# fragment.
for RECORD in RECORDS:
    num += 1
    print(num)
    item = {}
    USER_NAME = extract_re(r"class='name'><span>(.*?)</span>", RECORD)
    REVIEW_RATING_VALUE = extract_re(r"<span class='n'>([0-9]\d*\.?\d*)</span>分", RECORD)
    ACCOMMODATION_TIME = extract_re(r"class='time'>发表于(.*?)</span>", RECORD)
    RAW_REVIEW_CONTENT = extract_re(r"class='J_commentDetail'>(.*?)</div>", RECORD)
    print(RAW_REVIEW_CONTENT)
    REVIEW_ID = extract_re(r"data-cid='(.*?)'", RECORD)
    USER_AVATAR = extract_re(r"class='head'.*?img src='(.*?)'", RECORD)
    # The reply text is extracted once and also drives the isNotReply flag;
    # the earlier second extraction sat behind an always-true condition.
    REPLY_REVIEW_CONTENT = extract_re(r"class='htl_reply'.*?class='text.*?>(.*?)</p>", RECORD)
    RAW_PICTURES = extract_re(r"class='comment_pic'(.*?)class='comment_bar'", RECORD)
    sourceSiteId = classify({"7": "(艺龙网用户)", "1": "(去哪儿网用户)", "2": "(.+)"}, USER_NAME)
    item["userName"] = USER_NAME
    item["reviewRatingValue"] = REVIEW_RATING_VALUE
    item["accommodationTime"] = ACCOMMODATION_TIME + " 00:00:00"
    item["replyReviewContent"] = REPLY_REVIEW_CONTENT
    item["reviewId"] = REVIEW_ID
    item["userAvatar"] = USER_AVATAR
    # "false" when reply text was extracted, "true" otherwise.
    item["isNotReply"] = "false" if REPLY_REVIEW_CONTENT else "true"
    item["rawReviewContent"] = RAW_REVIEW_CONTENT
    item["RAW_PICTURES"] = RAW_PICTURES
    EMOTION_TYPE = classify(
        {"0": "(1\\.|2\\.|3\\.0|3\\.1|3\\.2|3\\.3|3\\.4)", "2": "(3\\.5|3\\.6|3\\.7|3\\.8|3\\.9)",
         "1": "(^4$|5|4\\.)"}, REVIEW_RATING_VALUE)
    item["emotionType"] = EMOTION_TYPE
    item["sourceSiteId"] = sourceSiteId
    print(item)