return json.dumps({'grade_list': p(
        'div:nth-child(3) > div.review-filter > ul.ta-list.clearfix > li.taService').text().split('\n'),
                      'comment_num_list': p(
                          'div:nth-child(3) > div.review-filter > ul.filter-list.clearfix').text().split('\n')[1:]},
                     ensure_ascii=False)

# Detail-page fields. Apart from the address, each one reads the element's
# innerHTML and is given a filter_func.
fl_shop2 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='#hotel-page > div > div.hotel-box.hotel-baseinfo > div.info > div.base > p.address',
        offset=6,
        try_times=10,
        pause_time=5,
    ),
    Field(
        fieldname=FieldName.SHOP_ROOM_RECOMMEND_ALL,
        css_selector='#J_RoomList',
        attr='innerHTML',
        filter_func=get_room_all,
    ),
    Field(
        fieldname=FieldName.SHOP_INTRO,
        css_selector='#hotel-desc',
        attr='innerHTML',
        filter_func=get_shop_intro,
    ),
    Field(
        fieldname=FieldName.SHOP_FACILITIES,
        css_selector='#hotel-facility',
        attr='innerHTML',
        filter_func=get_shop_facility,
    ),
    Field(
        fieldname=FieldName.SHOP_TRAFFIC,
        css_selector='#rich-map-wrap',
        attr='innerHTML',
        filter_func=get_shop_traffic,
    ),
    Field(
        fieldname=FieldName.SHOP_STATISTICS,
        css_selector='#hotel-review',
        attr='innerHTML',
        filter_func=get_shop_statistics,
    ),
)

# List page (the name reads "Fliggy hotel shop list page").
page_shop_1 = Page(
    name='飞猪酒店店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(list_css_selector='#J_List > div'),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
)

# Detail page (the name reads "Fliggy hotel shop detail page"); only this page
# passes is_save=True.
page_shop_2 = Page(
    name='飞猪酒店店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(click_css_selector='div > div.row-center > div > h5 > a'),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
    is_save=True,
)

class FliggyHotelSpider(TravelDriver):
    """TravelDriver subclass carrying the hook for the Fliggy hotel detail page."""

    def page_shop_2_func(self):
        """Click every ``li.J_Tab`` under ``#rich-map-wrap``, then wait 3 seconds.

        Any exception raised while moving, scrolling, locating or clicking is
        caught and reported through ``self.error_log`` instead of propagating.
        """
        try:
            self.move_to_element_by_css_selector(
                css_selector='#rich-map-wrap > div.J_RichCon > div.tabs > ul')
            # Scroll back up by 200 after moving to the tab bar.
            self.vertical_scroll_by(offset=-200)
            tabs = self.until_presence_of_all_elements_located_by_css_selector(
                css_selector='#rich-map-wrap > div.J_RichCon > div.tabs > ul > li.J_Tab')
            for tab in tabs:
                tab.click()
        except Exception:
            self.error_log(e='找不到元素')
        time.sleep(3)
    Field(fieldname=FieldName.SHOP_IMG, css_selector=' div > div.flt1 > a > img', attr='src',is_info=True),
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div > div.ct-text > ul > li:nth-child(1) > a',
          is_info=True),

    Field(fieldname=FieldName.SHOP_GRADE,css_selector='',filter_func=get_shop_grade),
    #正则表达式不一样
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='div > div.ct-text > ul > li:nth-child(2) > a',filter_func=get_comment_num, is_info=True),

    Field(fieldname=FieldName.SHOP_FEATURE, css_selector='div > div.ct-text > p',is_info=True),
    Field(fieldname=FieldName.SHOP_PRICE,css_selector= '',filter_func=get_shop_price, is_info=True)
)


# No detail-page fields are configured here.
fl_shop2 = Fieldlist()
# List page (the name reads "Mafengwo attraction shop list page").
page_shop_1 = Page(
    name='马蜂窝景点店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#_j_search_result_left > div:nth-child(1) > div > ul > li',
    ),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
    is_save=True,
)
# NOTE(review): this empty Page() is rebound by the very next statement;
# it looks redundant - confirm Page() has no side effects before removing.
page_shop_2 = Page()
# Detail page (the name reads "Mafengwo attraction shop detail page").
page_shop_2 = Page(
    name='马蜂窝景点店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(click_css_selector='div > div.ct-text > h3 > a'),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
)




def get_comment_grade(self, _str):
    """Return the last element of ``_str`` converted to a string.

    ``self`` is accepted but not used. An empty ``_str`` raises IndexError.
    """
    final_item = _str[-1]
    return str(final_item)
def get_comment_time(self, _str):
    """Return the first ten characters of ``_str``.

    Times are normalised to the ``2018-12-08`` form, which is ten characters
    long. ``self`` is accepted but not used.
    """
    date_width = 10
    return _str[:date_width]
def get_comment_year(self, _str):
    """Return the first four characters of the ten-character prefix of ``_str``.

    ``self`` is accepted but not used.
    """
    date_part = _str[:10]
    year_part = date_part[:4]
    return year_part
  Field(fieldname=FieldName.SHOP_GRADE,css_selector='div > div.hotel-brief.fl > div.satisfaction > span.highlight',is_info=True),
    #正则表达式不一样
#mainHotelLeft > div:nth-child(2) > div > div:nth-child(2) > ul > li:nth-child(2)
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='div > div.hotel-brief.fl > div.comment > a > span', is_info=True),

    Field(fieldname=FieldName.SHOP_FEATURE, css_selector='div > div.hotel-info.fl > div.nameAndIcon > span.decorate_year',is_info=True),
    Field(fieldname=FieldName.SHOP_PRICE, css_selector='div > div.hotel-brief.fl > div.startPrice > span.digit', is_info=True),
    Field(fieldname=FieldName.SHOP_RATE, css_selector='',filter_func=get_shop_rate,
          is_info=True),

)

# No detail-page fields are configured here.
fl_shop2 = Fieldlist()

# List page (the name reads "Tuniu hotel shop list page").
page_shop_1 = Page(
    name='途牛酒店店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(list_css_selector='#main > div.hotel-list > div'),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
    is_save=True,
)
# NOTE(review): this empty Page() is rebound by the very next statement;
# it looks redundant - confirm Page() has no side effects before removing.
page_shop_2 = Page()
# Detail page (the name reads "Tuniu hotel shop detail page").
page_shop_2 = Page(
    name='途牛酒店店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(click_css_selector='div > div.hotel-info.fl > div.nameAndIcon > a'),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
)



def get_comment_user_name(self, _str):
    """Return the part of ``_str`` that precedes the first space.

    The whole string is returned when it contains no space. ``self`` is
    accepted but not used.
    """
    leading_token, _, _ = _str.partition(' ')
    return leading_token

def get_comment_time(self, _str):
    """Return the first ``YYYY-MM-DD HH:MM`` timestamp found in ``_str``.

    Raises IndexError when ``_str`` contains no such timestamp. ``self`` is
    accepted but not used.
    """
    timestamps = re.findall(r'([\d]{4}-[\d]{2}-[\d]{2} [\d]{2}:[\d]{2})', _str)
    return timestamps[0]
def get_comment_grade(self,_str):
    #判断如果含有好 高
    if ('好' in _str) and ('但是' in _str) == False:
        return  str("5.0")
    elif ('好' in _str) and ('但是' in _str) == True:
Example #4
0
        'div > div > div.clrfix > div.item_hotel_info > div.item_hotel_bsinfo > table > tbody > tr > td.item_hotel_name > div > div.level.levelmargin > a.level_score.js_list_score > strong',
        is_info=True),
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='',
          filter_func=get_shop_rate,
          is_info=True),
    Field(fieldname=FieldName.SHOP_FEATURE,
          filter_func=get_shop_feature,
          css_selector='',
          is_info=True))

# No detail-page fields are configured here.
fl_shop2 = Fieldlist()
# List page (the name reads "Qunar hotel shop list page"); passes is_save=True.
page_shop_1 = Page(name='去哪儿酒店店铺列表页面',
                   fieldlist=fl_shop1,
                   listcssselector=ListCssSelector(
                       list_css_selector='#jxContentPanel > div', ),
                   mongodb=Mongodb(db=TravelDriver.db,
                                   collection=TravelDriver.shop_collection),
                   is_save=True)

# Detail page (the name reads "Qunar hotel shop detail page"); unlike
# page_shop_1, no is_save argument is passed.
page_shop_2 = Page(
    name='去哪儿酒店店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(click_css_selector='a.e_title.js_list_name'),
    mongodb=Mongodb(db=TravelDriver.db,
                    collection=TravelDriver.shop_collection))


def get_comment_grade(self, _str):
    """Return the trailing element of ``_str`` as a string.

    ``self`` is accepted but not used. An empty ``_str`` raises IndexError.
    """
    trailing = _str[-1]
    return str(trailing)
Example #5
0
        is_info=True),
    #无shop_feature
    Field(fieldname=FieldName.SHOP_FEATURE,
          css_selector='',
          filter_func=get_shop_feature,
          is_info=True),
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='',
          filter_func=get_shop_rate,
          is_info=True),
)
# List page (the name reads "Qunar attraction shop list page"); passes
# is_save=True and is bound to TravelDriver.shop_collection.
page_shop_1 = Page(
    name='去哪儿景点店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector=
        '#main-page > div.mp-main > div:nth-child(2) > ul > li'),
    mongodb=Mongodb(db=TravelDriver.db,
                    collection=TravelDriver.shop_collection),
    is_save=True)

fl_shop2 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector=
        '#main-page > div.mp-main > div.mp-headfigure > div.mp-headfeagure-info > div'
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_URL,
        css_selector=
        '#main-page > div.mp-main > div.mp-baseinfo > div.mpg-flexbox.mp-flex-card > div:nth-child(1) > a',
Example #6
0
#\33 6822720 > div:nth-child(1) > div
    Field(fieldname=FieldName.SHOP_IMG, css_selector='a > div.img-container.lazy-img-box.fl > img', attr='src', is_info=True),
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector= '',filter_func=get_shop_address, is_info=True),
    #这里应该做一个转换
#\34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)
    Field(fieldname=FieldName.SHOP_GRADE,css_selector='',filter_func=get_shop_grade, is_info=True),
    #正则表达式的使用有问题
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-detail > p',is_info=True),
    #无shop_feature

    Field(fieldname=FieldName.SHOP_FEATURE, css_selector='',filter_func=get_shop_feature, is_info=True),

    Field(fieldname=FieldName.SHOP_RATE,css_selector='',filter_func=get_shop_rate, is_info=True),
Field(fieldname=FieldName.SHOP_COMMENT_URL,css_selector='a',attr='href',filter_func=get_shop_comment_url, is_info=True)
)
# List page (the name reads "Tuniu attraction shop list page").
page_shop_1 = Page(
    name='途牛景点店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#search-container > section > div > ul > li',
    ),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
    is_save=True,
)
# Detail-page fields: the shop name and the href of the comment link.
fl_shop2 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='#main-page > div.mp-main > div.mp-headfigure > div.mp-headfeagure-info > div',
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_URL,
        css_selector='#main-page > div.mp-main > div.mp-baseinfo > div.mpg-flexbox.mp-flex-card > div:nth-child(1) > a',
        attr='href',
        is_info=True,
    ),
)

# Detail page (the name reads "Tuniu attraction shop detail page"); only a
# name and a field list are passed.
page_shop_2 = Page(name='途牛景点店铺详情页面', fieldlist=fl_shop2)

def get_comment_user_name(self, _str):
    """Return the text of ``_str`` up to the first space.

    ``self`` is accepted but not used.
    """
    pieces = _str.split(' ')
    return pieces[0]

    Field(fieldname=FieldName.SHOP_NAME,css_selector=' div > div.h_info > div.h_info_text > div.h_info_base > p.h_info_b1 > a > span.info_cn',attr='innerHTML', is_info=True),

    Field(fieldname=FieldName.SHOP_URL,css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a',attr='href',is_info=True),
    Field(fieldname=FieldName.SHOP_IMG, css_selector='div.h_info_pic > a > img', attr='big-src',is_info=True),
    #有些问题
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b2',is_info=True),
    Field(fieldname=FieldName.SHOP_PRICE,css_selector='div > div.h_info_text > div.h_info_pri > p:nth-child(1) > a > span.h_pri_num',is_info=True),
    #稍许有些问题
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='div > div.h_info_text > div.h_info_comt > a > span.c555.block.mt5'),
    Field(fieldname=FieldName.SHOP_GRADE, css_selector=' div > div.h_info_text > div.h_info_comt > a > span.h_info_comt_bg > i.c37e',is_info=True),
    Field(fieldname=FieldName.SHOP_RATE,css_selector='',filter_func=get_shop_rate, is_info=True),
    Field(fieldname=FieldName.SHOP_FEATURE,css_selector='',filter_func=get_shop_feature, is_info=True)

)
# No detail-page fields are configured here.
fl_shop2 = Fieldlist()
# List page (the name reads "eLong hotel shop list page").
page_shop_1 = Page(
    name='艺龙酒店店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(list_css_selector='#hotelContainer > div > div'),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
    is_save=True,
)
# page_shop_2 = Page()
#
# Detail page (the name reads "eLong hotel shop detail page").
page_shop_2 = Page(
    name='艺龙酒店店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(click_css_selector='div > div.h_info_text > div.h_info_base > p.h_info_b1 > a'),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
)
# Fields read for each comment entry.
fl_comment1 = Fieldlist(
    Field(
        fieldname=FieldName.COMMENT_USER_NAME,
        css_selector=' div.cmt_userinfo > div > p.cmt_un',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.COMMENT_TIME,
        css_selector='div.cmt_info_mn > div > div.if_hd_r > span.cmt_con_time',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector='body > div.hdetail_rela_wrap > div > div.hrela_ns_wrap.clearfix > div.hdetail_main.hrela_name > div > h1',
        is_isolated=True,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.COMMENT_CONTENT,
        css_selector='div.cmt_info_mn > p.cmt_txt',
        is_info=True,
    ),
    # Original author's note: this one has problems.
    Field(
        fieldname=FieldName.COMMENT_SCORE,
        css_selector='div.cmt_info_mn > div > div.if_hd > b',
        is_info=True,
    ),
)

# Comment list page (the name reads "eLong hotel comment list"); bound to
# TravelDriver.comments_collection rather than the shop collection.
page_comment_1 = Page(
    name='艺龙酒店评论列表',
    fieldlist=fl_comment1,
    listcssselector=ListCssSelector(list_css_selector='#review > ul > li'),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.comments_collection),
    is_save=True,
)
Example #8
0
        ' div.product-regular.clearfix > div.product-section > dl:nth-child(6) > dd > div'
    ),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector=
        'div.product-regular.clearfix > div.product-info > ul > li:nth-child(1) > b',
        filter_func=get_shop_grade,
        is_info=True),
)

# No detail-page fields are configured here.
fl_shop2 = Fieldlist()

# List page (the name reads "Lvmama attraction shop list page"); passes both a
# list selector and an item selector.
page_shop_1 = Page(name='驴妈妈景点店铺列表页面',
                   fieldlist=fl_shop1,
                   listcssselector=ListCssSelector(
                       list_css_selector=' div.product-list > div',
                       item_css_selector='div'),
                   mongodb=Mongodb(db=TravelDriver.db,
                                   collection=TravelDriver.shop_collection),
                   is_save=True)
# NOTE(review): this empty Page() is rebound by the very next statement;
# it looks redundant - confirm Page() has no side effects before removing.
page_shop_2 = Page()
# Detail page (the name reads "Lvmama attraction shop detail page"); is_save
# is explicitly False here.
page_shop_2 = Page(
    name='驴妈妈景点店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(
        click_css_selector=
        'div.product-regular.clearfix > div.product-section > h3 > a'),
    mongodb=Mongodb(db=TravelDriver.db,
                    collection=TravelDriver.shop_collection),
    is_save=False)

Example #9
0
# fl_shop2 = Fieldlist(
#     Field(fieldname=FieldName.SHOP_PRICE, css_selector='div.piao_wrap redraw > div.mp-description.pngfix > div.mp-description-detail > div.mp-description-price > span.mp-description-qunar-price > em', pause_time=3, is_focus=True, is_info=True),
#     #评论次数还要改善
#     Field(fieldname=FieldName.SHOP_TIME,css_selector="#mp-charact > div >  div.mp-charact-time > div.mp-charact-content > div.mp-charact-desc > p"),
#     Field(fieldname=FieldName.SHOP_COMMENT_NUM, css_selector='div.piao_wrap redraw > div.mp-description.pngfix > div.mp-description-detail > div.mp-description-comments > span.mp-description-commentCount > a', is_focus=True),
#     Field(fieldname=FieldName.SHOP_SERVICE,css_selector='#root > div > div > div > div > div:nth-child(3) > div.main-bd > div > div.brief-box.clearfix > div.brief-right > ul > li.promise',attr='innerHTML', filter_func=get_shop_service, is_focus=True),
#     Field(fieldname=FieldName.SHOP_TICKET, css_selector='div.mp-tickets',attr='innerHTML', filter_func=get_shop_ticket, is_focus=True),
#     Field(fieldname=FieldName.SHOP_INFO, css_selector='#mp-charact > div >  div.mp-charact-intro > div.mp-charact-desc > p', attr='innerHTML', filter_func=get_shop_info, is_focus=True)
# )
#

# List page (the name reads "Qunar attraction shop list page"); passes both a
# list selector and an item selector, and no is_save argument.
page_shop_1 = Page(name='去哪儿景点店铺列表页面',
                   fieldlist=fl_shop1,
                   listcssselector=ListCssSelector(
                       list_css_selector='#search-list > div',
                       item_css_selector='div'),
                   mongodb=Mongodb(db=TravelDriver.db,
                                   collection=TravelDriver.shop_collection))

#page_shop_2 = Page(name='去哪儿景点店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='div.sight_item_detail.clrfix > div.sight_item_about >  h3 > a'), mongodb=Mongodb(db=TravelDriver.db,collection=TravelDriver.shop_collection), is_save=True)
#
# def get_comment_user_name(self, _str):
#     return _str.split(' ')[0]
#
# def get_comment_time(self, _str):
#     return re.findall(r'([\d]{4}-[\d]{2}-[\d]{2} [\d]{2}:[\d]{2})',_str)[0]
#
# fl_comment1 = Fieldlist(
#     Field(fieldname=FieldName.COMMENT_USER_NAME, css_selector='span.mp-comments-username', filter_func=get_comment_user_name),
#     Field(fieldname=FieldName.COMMENT_TIME, css_selector='span.mp-comments-time', filter_func=get_comment_time),
Example #10
0
    #img还有些许问题
#\33 6822720 > div:nth-child(1) > div
    Field(fieldname=FieldName.SHOP_IMG, css_selector='a > div.img-container.lazy-img-box.fl > img', attr='src', is_info=True),
    Field(fieldname=FieldName.SHOP_ADDRESS, css_selector= '',filter_func=get_shop_address, is_info=True),
    #这里应该做一个转换
#\34 187 > div:nth-child(2) > div:nth-child(3) > div > div:nth-child(2) > span:nth-child(1)
    Field(fieldname=FieldName.SHOP_GRADE,css_selector='',filter_func=get_shop_grade, is_info=True),
    #正则表达式的使用有问题
    Field(fieldname=FieldName.SHOP_COMMENT_NUM,css_selector='a > div.search-scenic-content > div.search-scenic-wrapper > div.search-scenic-detail > p',is_info=True),
    #无shop_feature

    Field(fieldname=FieldName.SHOP_FEATURE, css_selector='',filter_func=get_shop_feature, is_info=True),

    Field(fieldname=FieldName.SHOP_RATE,css_selector='',filter_func=get_shop_rate, is_info=True)
)
# List page (the name reads "Tuniu attraction shop list page").
page_shop_1 = Page(
    name='途牛景点店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#search-container > section > div > ul > li',
    ),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
    is_save=True,
)

def get_comment_grade(self, _str):
    """Map the number of ``.star-active`` matches in ``_str`` to a grade string.

    ``_str`` is handed to ``pq`` (presumably pyquery - confirm against the
    file's imports). Three matches give "5", two give "2.5", and any other
    count gives "0". ``self`` is accepted but not used.
    """
    active_star_count = pq(_str)('.star-active').length
    grade_by_count = {3: "5", 2: "2.5"}
    return grade_by_count.get(active_star_count, "0")

def get_comment_year(self, _str):
    """Return the first four characters of the ten-character prefix of ``_str``.

    ``self`` is accepted but not used.
    """
    return _str[:10][:4]

def get_comment_season(self, _str):
Example #11
0
          is_debug='True',
          filter_func=get_shop_grade,
          is_info=True),
)

# fl_shop2 = Fieldlist(
#     Field(fieldname=FieldName.SHOP_ADDRESS, css_selector='div.container > div.hotel-intro > div.intro-hd > div.location > span', attr='title', offset=6, try_times=10, pause_time=1),
#     Field(fieldname=FieldName.SHOP_ROOM_RECOMMEND_ALL, css_selector='#_j_booking_info', attr='innerHTML', filter_func=get_shop_room_all, offset=6, try_times=10, pause_time=2),
#     Field(fieldname=FieldName.SHOP_TRAFFIC, css_selector='#_j_map_poi_list > div.bd', attr='innerHTML', filter_func=get_shop_traffic, offset=6, try_times=10, pause_time=1),
#     Field(fieldname=FieldName.SHOP_FACILITIES, css_selector='#_j_hotel_info', attr='innerHTML', filter_func=get_shop_facilities, offset=6, try_times=10, pause_time=1),
#     Field(fieldname=FieldName.SHOP_STATISTICS, css_selector='#_j_comment', attr='innerHTML', filter_func=get_shop_stattistics),
# )

# List page (the name reads "Mafengwo hotel shop list page"); no is_save
# argument is passed.
page_shop_1 = Page(name='马蜂窝酒店店铺列表页面',
                   fieldlist=fl_shop1,
                   listcssselector=ListCssSelector(
                       list_css_selector='#_j_hotel_list > div.hotel-item'),
                   mongodb=Mongodb(db=TravelDriver.db,
                                   collection=TravelDriver.shop_collection))

# page_shop_2 = Page(name='马蜂窝酒店店铺详情页面', fieldlist=fl_shop2, tabsetup=TabSetup(click_css_selector='div.hotel-pic > a'),mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection), is_save=True)


class MafengwoHotelSpider(TravelDriver):
    def get_shop_info(self):
        """Fetch the data list for ``page_shop_1``; errors are logged, not raised."""
        try:
            # The result is bound but not used further; the step that consumed
            # it is the commented-out call below.
            shop_data_list = self.from_page_get_data_list(page=page_shop_1)
            # self.from_page_add_data_to_data_list(page=page_shop_2, data_list=shop_data_list, pre_page=page_shop_1)
        except Exception as e:
            # Any failure is passed to error_log instead of propagating.
            self.error_log(e=e)

    def get_shop_info_list(self):
Example #12
0
          pause_time=5),
    Field(fieldname=FieldName.SHOP_INTRO,
          css_selector='#hotelIntroduction > div.hotel_introduction_body'),
    Field(fieldname=FieldName.SHOP_TRAFFIC,
          css_selector='#hotelTraffic',
          attr='innerHTML',
          filter_func=get_shop_traffic),
    Field(fieldname=FieldName.SHOP_STATISTICS,
          css_selector='#hotelUserComment',
          attr='innerHTML',
          filter_func=get_shop_statistics),
)

# List page (the name reads "Tuniu hotel shop list page"); no is_save argument
# is passed.
page_shop_1 = Page(name='途牛酒店店铺列表页面',
                   fieldlist=fl_shop1,
                   listcssselector=ListCssSelector(
                       list_css_selector='#main > div.hotel-list > div'),
                   mongodb=Mongodb(db=TravelDriver.db,
                                   collection=TravelDriver.shop_collection))

# Detail page (the name reads "Tuniu hotel shop detail page"); passes
# is_save=True.
page_shop_2 = Page(
    name='途牛酒店店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(
        click_css_selector='div.hotel-brief.fl > div.hotelDetail > a'),
    mongodb=Mongodb(db=TravelDriver.db,
                    collection=TravelDriver.shop_collection),
    is_save=True)


class TuniuHotelSpider(TravelDriver):
    def page_shop_2_func(self):
Example #13
0
        tag_list.append(i.text())
    statistics.setdefault('tag_list', tag_list)
    comment_num_list = []
    for i in p('dl.rank').items('dd'):
        comment_num_list.append({re.sub(r'[^\u4e00-\u9fa5]*', '', i.text()): re.sub(r'[^\d]*', '', i.text())})
    statistics.setdefault('comment_num_list', comment_num_list)
    return json.dumps(statistics, ensure_ascii=False)

# Detail-page fields; every one reads innerHTML and is given a filter_func.
fl_shop2 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_ROOM_RECOMMEND_ALL,
        css_selector='div.m-room-tools-bd.js-roomtool-rooms.caculate-price',
        attr='innerHTML',
        filter_func=get_room_all,
        pause_time=5,
    ),
    Field(
        fieldname=FieldName.SHOP_TRAFFIC,
        css_selector='#js-neighbor',
        attr='innerHTML',
        filter_func=get_shop_traffic,
        pause_time=1,
    ),
    Field(
        fieldname=FieldName.SHOP_FACILITIES,
        css_selector='#descContent',
        attr='innerHTML',
        filter_func=get_shop_facilities,
    ),
    Field(
        fieldname=FieldName.SHOP_STATISTICS,
        css_selector='#comment_main > div > div.b_ugcheader > div.b_ugcfilter',
        attr='innerHTML',
        filter_func=get_shop_statistics,
    ),
)

# List page (the name reads "Qunar hotel shop list page").
page_shop_1 = Page(
    name='去哪儿酒店店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='div.b_result_box.js_list_block.b_result_commentbox',
    ),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
)

# Detail page (the name reads "Qunar hotel shop detail page"); passes
# is_save=True.
page_shop_2 = Page(
    name='去哪儿酒店店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(click_css_selector='a.e_title.js_list_name'),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
    is_save=True,
)

class QunarHotelSpider(TravelDriver):

    def page_shop_2_func(self):
        try:
            for i in self.until_presence_of_all_elements_located_by_partial_link_text(link_text='查看其他'):
                i.click()
        except Exception:
            self.error_log(e='找不到元素')
        try:
            for i in self.until_presence_of_all_elements_located_by_partial_link_text(link_text='展开报价'):
                i.click()
        except Exception:
    return p.text()


# List-page fields: both read the same 'dl > dt > a' element, once with the
# default attribute for the name and once with attr='href' for the URL.
fl_shop1 = Fieldlist(
    Field(fieldname=FieldName.SHOP_NAME, css_selector='dl > dt > a'),
    Field(fieldname=FieldName.SHOP_URL,
          css_selector='dl > dt > a',
          attr='href',
          is_debug=True,
          is_info=True),
)
# List page (the name reads "Qiandao Lake travel-notes shop list page").
page_shop_1 = Page(
    name='千岛湖游记店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector=
        'body > div.content.cf > div.main > div.search-content.cf > div > div.result > ul > li'
    ),
    mongodb=Mongodb(db=TravelDriver.db,
                    collection=TravelDriver.shop_collection),
    is_save=True)

fl_shop2 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_TAG,
        css_selector=
        'body > div.bgf2f2f2 > div.content.cf > div.ctd_main > div.ctd_main_body > div.ctd_content > div.ctd_content_controls.cf > div',
        is_focus=True,
        is_info=True),
    Field(
        fieldname=FieldName.SHOP_DETAIL,
        css_selector=
    Field(fieldname=FieldName.SHOP_FEATURE,
          css_selector='',
          is_info=True,
          filter_func=get_shop_feature),
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='',
          is_info=True,
          filter_func=get_shop_rate))

# No detail-page fields are configured here.
fl_shop2 = Fieldlist()

# List page (the name reads "Tuniu attraction shop list page").
page_shop_1 = Page(
    name='途牛景点店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector=
        '#niuren_list > div.contentcontainer.clearfix > div.content_bottom > div.main.fl > div.thelist > ul > li',
    ),
    mongodb=Mongodb(db=TravelDriver.db,
                    collection=TravelDriver.shop_collection),
    is_save=True)
# NOTE(review): this empty Page() is rebound by the very next statement;
# it looks redundant - confirm Page() has no side effects before removing.
page_shop_2 = Page()
# Detail page (the name reads "Tuniu attraction shop detail page"). Note that
# the click selector string starts with a space.
page_shop_2 = Page(
    name='途牛景点店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(click_css_selector=
                      ' div.theinfo.ticket.clearfix > a > dl > dt > p > span'),
    mongodb=Mongodb(db=TravelDriver.db,
                    collection=TravelDriver.shop_collection),
    is_save=True)

    Field(fieldname=FieldName.SHOP_ADDRESS,
          css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(1)',
          is_info=True),
    Field(fieldname=FieldName.SHOP_PHONE,
          css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(2)',
          is_info=True),
    Field(fieldname=FieldName.SHOP_AREA,
          css_selector='dl > dd > div.tourListLeftListMsg > span:nth-child(2)',
          filter_func=get_shop_area,
          is_info=True))

# List page (the name reads "Dianping restaurant shop list page").
page_shop_1 = Page(
    name='大众点评餐饮店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector=
        'body > div.mainLayout.newsMainLayout > div.newsLeftLayout.sceneRightLayout > div'
    ),
    mongodb=Mongodb(db=TravelDriver.db,
                    collection=TravelDriver.shop_collection),
    is_save=True)

# Single-field list for the detail page: the SHOP_DES text.
detail_fl_shop2 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_DES,
        css_selector=
        'body > div.mainLayout.newsMainLayout > div.newsLeftLayout.sceneRightLayout > div.newsDetailContent > div.newsDetailConNote > div',
        is_info=True), )

# Detail page (the name reads "Dianping page for getting review score and
# count"); no list selector, tab setup or Mongodb handle is passed.
detail_shop_2 = Page(name='大众点评获取评论分数和数量页面',
                     fieldlist=detail_fl_shop2,
                     is_save=True)
Example #17
0
    for i in p('div.htl_info_table > table > tbody').items('tr'):
        item = (lambda x: x if x else '')(i.text()).split('\n')
        if len(item) >= 2:
            around.setdefault(item[0], (lambda x: x[1:] if len(x) >= 2 else [''])(item))
    return json.dumps(around, ensure_ascii=False)

# Detail-page fields; every one passes is_focus=True.
fl_shop2 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_ROOM_RECOMMEND_ALL,
        css_selector='#hotelRoomBox',
        attr='innerHTML',
        filter_func=get_recommend_all_room_dict,
        pause_time=1,
        is_focus=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ROOM_FAVOURABLE,
        css_selector='#divDetailMain > div.htl_room_table',
        attr='innerHTML',
        filter_func=get_favourable_room,
        is_focus=True,
    ),
    Field(
        fieldname=FieldName.SHOP_INTRO,
        css_selector='#hotel_info_comment > div',
        attr='innerHTML',
        filter_func=get_hotel_intro,
        is_focus=True,
    ),
    # The phone number is read from the data-real attribute and passed through
    # a regex/repl pair that keeps the text before the first '<'.
    Field(
        fieldname=FieldName.SHOP_PHONE,
        css_selector='#J_realContact',
        attr='data-real',
        regex='^([^<]*).*$',
        repl=r'\1',
        is_focus=True,
    ),
    Field(
        fieldname=FieldName.SHOP_STATISTICS,
        css_selector='#commentList > div.detail_cmt_box',
        attr='innerHTML',
        filter_func=get_shop_statistics,
        is_focus=True,
    ),
    Field(
        fieldname=FieldName.SHOP_AROUND_FACILITIES,
        css_selector='#hotel_info_comment > div',
        attr='innerHTML',
        filter_func=get_around_facilities,
        is_focus=True,
    ),
)

# List page (the name reads "Ctrip hotel shop list page").
page_shop_1 = Page(
    name='携程酒店店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#hotel_list > div.hotel_new_list',
        item_css_selector='ul.hotel_item',
    ),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
)

# Detail page (the name reads "Ctrip hotel shop detail page"); passes
# is_save=True.
page_shop_2 = Page(
    name='携程酒店店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(click_css_selector='li.hotel_price_icon > div.action_info > p > a'),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
    is_save=True,
)

fl_comment1 = Fieldlist(
    Field(fieldname=FieldName.COMMENT_USER_NAME, css_selector='div.user_info.J_ctrip_pop > p.name'),
    Field(fieldname=FieldName.COMMENT_TIME, css_selector='div.comment_main > div.comment_txt > div.comment_bar > p > span', regex=r'[^\d-]*'),
    Field(fieldname=FieldName.SHOP_NAME, css_selector='#J_htl_info > div.name > h2.cn_n', is_isolated=True),
    Field(fieldname=FieldName.COMMENT_CONTENT, css_selector='div.comment_main > div.comment_txt > div.J_commentDetail'),
    Field(fieldname=FieldName.COMMENT_USER_IMG, css_selector='div.user_info.J_ctrip_pop > p.head > span > img', attr='src'),
    Field(fieldname=FieldName.COMMENT_USER_CHECK_IN, css_selector='div.comment_main > p > span.date'),
    Field(fieldname=FieldName.COMMENT_USER_ROOM, css_selector='div.comment_main > p > a'),
    Field(fieldname=FieldName.COMMENT_TYPE, css_selector='div.comment_main > p > span.type'),
    Field(fieldname=FieldName.COMMENT_SCORE, css_selector='div.comment_main > p > span.score', regex=r'[^\d.]*'),
    Field(fieldname=FieldName.COMMENT_SCORE_TEXT, css_selector='div.comment_main > p > span.small_c', attr='data-value'),
    Field(fieldname=FieldName.COMMENT_USER_NUM, css_selector='div.user_info.J_ctrip_pop > p.num'),
Example #18
0
          css_selector='#booking-wrapper',
          attr='innerHTML',
          filter_func=get_shop_ticket,
          is_focus=True),
    Field(fieldname=FieldName.SHOP_INFO,
          css_selector=
          'div.main-bd > div.main-wrapper > div.clearfix > div.detail-left',
          attr='innerHTML',
          filter_func=get_shop_info,
          is_focus=True),
)

# List page (the name reads "Ctrip attraction shop list page"); passes both a
# list selector and an item selector.
page_shop_1 = Page(name='携程景点店铺列表页面',
                   fieldlist=fl_shop1,
                   listcssselector=ListCssSelector(
                       list_css_selector='#searchResultContainer > div',
                       item_css_selector='div'),
                   mongodb=Mongodb(db=TravelDriver.db,
                                   collection=TravelDriver.shop_collection),
                   is_save=True)

# Detail page (the name reads "Ctrip attraction shop detail page").
page_shop_2 = Page(
    name='携程景点店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(click_css_selector='div.search_ticket_title > h2 > a'),
    mongodb=Mongodb(db=TravelDriver.db,
                    collection=TravelDriver.shop_collection),
    is_save=True)


def get_comment_user_name(self, _str):
    """Return the part of ``_str`` that precedes the first space.

    The whole string is returned when it contains no space. ``self`` is
    accepted but not used.
    """
    # The second ``return _str;`` that used to follow could never execute and
    # has been removed.
    return _str.split(' ')[0]

# Detail-page fields.
# Selector scratch notes kept from the original:
#   #phoenix_dom_3_0 > div > div.head-wrapper.c-title.c-color.c-flexbox.c-line-bottom > div.left > span
#   #phoenix_dom_3_1 > div > div.head-wrapper.c-title.c-color.c-flexbox.c-line-bottom > div.left > span
#   #generalheader > div.generalHead-left-header.animation-common > div.generalHead-left-header-title > span
fl_shop2 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='div.card-box.special2-box.c-container >div.head-wrapper.c-title.c-color.c-flexbox.c-line-bottom > div.left > span',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_SCORE,
        css_selector='span.left-header-visit',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_CATEGORY_NAME,
        css_selector='span.left-header-stdtag',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PRICE,
        css_selector='span.left-header-reference-price',
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_NAME_SEARCH_KEY,
        css_selector='div.generalHead-left-header-title > span',
        filter_func=get_shop_name,
        is_info=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PHONE,
        css_selector='#generalinfo > div.generalInfo-address-telnum > div.generalInfo-telnum.item > span.clampword.generalInfo-telnum-text',
        is_info=True,
    ),
)
# Selector scratch note kept from the original:
#   #card-1 > div > ul > li:nth-child(1) > div.cf.mb_5 > div.ml_30.mr_85 > div:nth-child(2)
# List page (the name reads "Baidu dining shop list page").
page_shop_1 = Page(
    name='百度餐饮店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(list_css_selector='ul.poilist > li'),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
    is_save=True,
)

# Detail page (the name reads "Baidu dining shop detail page").
page_shop_2 = Page(
    name='百度餐饮店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(click_css_selector='div.cf > div.ml_30 > div:nth-child(1) > span > a'),
    mongodb=Mongodb(db=TravelDriver.db, collection=TravelDriver.shop_collection),
    is_save=True,
)


def get_shop_lng(self, _str):
    """Pick the longitude out of a comma-separated string, with a fixed fallback.

    The text before the first comma is parsed as a float. When it lies within
    [118.650908, 119.243071] that text is returned unchanged (a ``str``);
    otherwise the constant 119.051491 (a ``float``) is returned. ``self`` is
    accepted but not used.

    NOTE(review): the two branches return different types - confirm callers
    cope with both.
    """
    lng_text = _str.split(',')[0]
    # Accept the value only inside the fixed longitude window.
    if 118.650908 <= float(lng_text) <= 119.243071:
        return lng_text
    return 119.051491
Example #20
0
from spider.driver.base.mongodb import Mongodb
from spider.driver.base.tabsetup import TabSetup

# Single field for the account list: 'public_name', given a regex whose
# character class matches everything outside the \u4e00-\u9fa5 range.
# NOTE(review): how Field applies `regex` is not visible here - confirm.
fl_weixin1 = Fieldlist(
    Field(fieldname='public_name',
          css_selector='div > div.txt-box > p.tit > a',
          regex=r'[^\u4e00-\u9fa5]*'), )

# Fields for each article entry: its name and its time.
fl_weixin2 = Fieldlist(
    Field(fieldname='article_name', css_selector='div > div > h4'),
    Field(fieldname='article_time',
          css_selector='div > div > p.weui_media_extra_info'),
)

# Account list page (the name reads "WeChat official account list page").
page_weixin_1 = Page(name='微信公众号列表页面',
                     fieldlist=fl_weixin1,
                     listcssselector=ListCssSelector(
                         list_css_selector='#main > div.news-box > ul > li'))

# Article list page (the name reads "WeChat official account article list
# page"); its click selector is the same anchor fl_weixin1 reads the name from.
page_weixin_2 = Page(
    name='微信公众号文章列表页面',
    fieldlist=fl_weixin2,
    tabsetup=TabSetup(click_css_selector='div > div.txt-box > p.tit > a'),
    listcssselector=ListCssSelector(list_css_selector='#history > div'))


class WeixinSpider(Driver):
    def __init__(self,
                 isheadless=False,
                 ismobile=False,
                 isvirtualdisplay=False,
                 spider_id='',
          css_selector='',
          filter_func=get_shop_feature,
          is_info=True),
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='',
          filter_func=get_shop_rate,
          is_info=True),
    Field(fieldname=FieldName.SHOP_COMMENT_URL,
          css_selector='a',
          attr='href',
          filter_func=get_comment_url,
          is_info=True))
# Page named "Lvmama scenic-spot shop list page": list items are matched by
# '#ticket_searchListUl1 > li' and the page is bound to the shop collection
# of TravelDriver.db, with is_save=True.
page_shop_1 = Page(name='驴妈妈景点店铺列表页面',
                   fieldlist=fl_shop1,
                   listcssselector=ListCssSelector(
                       list_css_selector='#ticket_searchListUl1 > li'),
                   mongodb=Mongodb(db=TravelDriver.db,
                                   collection=TravelDriver.shop_collection),
                   is_save=True)


def get_comment_time(self, _str):
    """Return the first ten characters of ``_str`` (fewer if it is shorter)."""
    leading = _str[:10]
    return leading


def get_comment_grade(self, _str):
    """Convert the first 1-3 digit number found in ``_str`` to a 0-5 grade.

    The number is divided by 150 and multiplied by 5, and the result is
    returned as a string. Both the raw input and the computed grade are
    printed. Raises IndexError when ``_str`` contains no digits.

    NOTE(review): presumably the number is a pixel width out of 150 —
    confirm against the page markup.
    """
    print(_str)
    numbers = re.findall(r'[\d]{1,3}', _str)
    grade = float(numbers[0]) / 150 * 5
    print(grade)
    return str(grade)
Example #22
0
          is_info=True),
    Field(fieldname=FieldName.SHOP_GRADE,
          css_selector='div > div:nth-child(2) > ul > li:nth-child(1) > a > b',
          filter_func=get_shop_grade,
          is_info=True),
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='',
          filter_func=get_shop_rate,
          is_info=True),
)

# The detail page below is configured with an empty Fieldlist.
fl_shop2 = Fieldlist()
# Page named "Lvmama scenic-spot shop list page", bound to the shop collection.
# NOTE(review): the name says scenic spot, but the list selector is
# '#mainHotelLeft > div' — confirm which section of the site this targets.
page_shop_1 = Page(name='驴妈妈景点店铺列表页面',
                   fieldlist=fl_shop1,
                   listcssselector=ListCssSelector(
                       list_css_selector='#mainHotelLeft > div', ),
                   mongodb=Mongodb(db=TravelDriver.db,
                                   collection=TravelDriver.shop_collection),
                   is_save=True)
# NOTE(review): this no-argument Page() is rebound by the very next statement,
# so its value is never read — looks like a leftover; confirm it can be removed.
page_shop_2 = Page()
# Page named "Lvmama scenic-spot shop detail page"; no is_save argument is
# passed here, unlike page_shop_1 above.
page_shop_2 = Page(name='驴妈妈景点店铺详情页面',
                   fieldlist=fl_shop2,
                   tabsetup=TabSetup(click_css_selector='dl > dt > a'),
                   mongodb=Mongodb(db=TravelDriver.db,
                                   collection=TravelDriver.shop_collection))


def get_comment_user_name(self, _str):
    """Return the part of ``_str`` that precedes the first space.

    If there is no space the whole string is returned.
    """
    name, _sep, _rest = _str.partition(' ')
    return name

from selenium.webdriver.remote.webelement import WebElement

from spider.driver.base.tabsetup import TabSetup
from spider.driver.base.field import Field, Fieldlist
from spider.driver.base.page import Page, PageGroup
from spider.driver.base.listcssselector import ListCssSelector
from spider.driver.base.mongodb import Mongodb

# Ad-hoc script: builds two Page objects that share one Fieldlist, Mongodb,
# ListCssSelector and TabSetup, groups them in a PageGroup, and prints the
# result of next() on that group.
# NOTE(review): the arguments look like placeholder values (ints and short
# strings rather than real names/selectors) — presumably a manual smoke test.
fl = Fieldlist(Field(fieldname=12), Field(fieldname=13))
mongo = Mongodb(db='122', collection='12')
lcs = ListCssSelector(list_css_selector=12)
tab = TabSetup(url_name=12)
p = Page(name=122,
         fieldlist=fl,
         mongodb=mongo,
         listcssselector=lcs,
         tabsetup=tab)
p1 = Page(name=123,
          fieldlist=fl,
          mongodb=mongo,
          listcssselector=lcs,
          tabsetup=tab)
pg = PageGroup(p, p1)
# Runs at import time: prints whatever PageGroup yields first.
print(next(pg))
        if len(star) == 2:
            stars.setdefault(star[0], re.sub(r'[^\d]*', '', star[1]))
    tag_star_dict.setdefault('star', stars)
    return json.dumps(tag_star_dict, ensure_ascii=False)

# Fields for the detail page defined below as page_shop_2.
fl_shop2 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector='#poi-detail > div.container > div.base-info > div.main-detail.clearfix > div.main-detail-right > div.hotel-appraise > div.hotel-scope > span',
        pause_time=5,
        is_focus=True,
    ),
    Field(
        fieldname=FieldName.SHOP_PHONE,
        css_selector='#poi-detail > div.container > div.base-info > div.main-detail.clearfix > div.main-detail-left > div.main-detail-left-top.clearfix > div.hotel-detail-info > div > div.call-info > div > span.call-number',
        is_focus=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ADDRESS,
        css_selector='#poi-detail > div.container > div.base-info > div.main-detail.clearfix > div.main-detail-left > div.main-detail-left-top.clearfix > div.hotel-detail-price > div.hotel-address-box.clearfix > span.hotel-address',
        is_focus=True,
    ),
    Field(
        fieldname=FieldName.SHOP_ROOM_RECOMMEND_ALL,
        css_selector='#deal',
        attr='innerHTML',
        filter_func=get_shop_room_all,
        is_focus=True,
    ),
    Field(
        fieldname=FieldName.SHOP_INTRO,
        css_selector='#poi-detail > div.container > div.sub-content.clearfix > div.main > div> div.hotel-info',
        attr='innerHTML',
        filter_func=get_shop_intro,
        is_focus=True,
    ),
    Field(
        fieldname=FieldName.SHOP_STATISTICS,
        css_selector='#poi-detail > div.container > div.sub-content.clearfix > div.main > div.user-comment-info',
        attr='innerHTML',
        filter_func=get_shop_statistics,
        is_focus=True,
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_NUM,
        css_selector='#comment > div > h2 > a > span.count',
        regex=r'[^\d]*',
    ),
)

# Page named "Dianping hotel shop list page".
# NOTE(review): item_start=11 / item_end=12 look like a narrowed debugging
# window — confirm the intended range.
page_shop_1 = Page(
    name='大众点评酒店店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(
        list_css_selector='#poi-list > div.content-wrap > div > div.list-wrapper > div.content > ul > li',
        item_start=11,
        item_end=12,
    ),
    mongodb=Mongodb(
        db=TravelDriver.db,
        collection=TravelDriver.shop_collection,
    ),
)

# Page named "Dianping hotel shop detail page"; its TabSetup click selector
# targets the hotel-name link, and is_save=True is passed.
page_shop_2 = Page(
    name='大众点评酒店店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(
        click_css_selector='div.hotel-info-ctn > div.hotel-info-main > h2 > a.hotel-name-link',
    ),
    mongodb=Mongodb(
        db=TravelDriver.db,
        collection=TravelDriver.shop_collection,
    ),
    is_save=True,
)

def get_rate(self, _str):
    """Return the digits contained in ``_str``, divided by ten, as a string.

    All non-digit characters are removed, the remaining digits are parsed as
    one integer, and that integer divided by 10 is returned as a string
    (for example ``'str45'`` -> ``'4.5'``).

    Raises ValueError when ``_str`` contains no digits.
    """
    # Raw string: '\d' in a normal literal is an invalid escape sequence and
    # triggers a warning on recent Python versions. The pattern is unchanged.
    digits = re.sub(r'[^\d]*', '', _str)
    return str(int(digits) / 10)

def get_comment_rate_tag(self, _str):
    """Collect the stripped text of every ``span.item`` in the HTML ``_str``.

    Returns the texts as a JSON array string, with non-ASCII characters
    left unescaped.
    """
    document = PyQuery(_str)
    tags = [node.text().strip() for node in document('span.item').items()]
    return json.dumps(tags, ensure_ascii=False)

def get_comment_content(self, _str):
    """Return the text of the HTML ``_str`` with every '收起评论' removed.

    '收起评论' ("collapse comment") is the label text being stripped.
    """
    plain_text = PyQuery(_str).text()
    return plain_text.replace('收起评论', '')
Example #25
0
          css_selector='',
          filter_func=_get_shop_comment_num,
          is_info=True),
    #无shop_feature
    Field(fieldname=FieldName.SHOP_FEATURE,
          css_selector='div:nth-child(2) > div:nth-child(2) > span',
          is_info=True),
    Field(fieldname=FieldName.SHOP_RATE,
          css_selector='div:nth-child(2) > div:nth-child(3) > span',
          is_info=True))

# Page named "Fliggy scenic-spot shop list page", bound to the shop collection
# with is_save=True.
# NOTE(review): the exact meaning of item_css_selector='div' and item_start=4
# is defined by ListCssSelector, which is not visible here — presumably they
# pick the per-item element and skip leading entries; confirm.
page_shop_1 = Page(name='飞猪景点店铺列表页面',
                   fieldlist=fl_shop1,
                   listcssselector=ListCssSelector(
                       list_css_selector='#tus-recycleview > div > div',
                       item_css_selector='div',
                       item_start=4),
                   mongodb=Mongodb(db=TravelDriver.db,
                                   collection=TravelDriver.shop_collection),
                   is_save=True)

fl_shop2 = Fieldlist(
    Field(
        fieldname=FieldName.SHOP_NAME,
        css_selector=
        'body > div > div.rax-scrollview > div > div:nth-child(1) > div > div:nth-child(1) > span'
    ),
    Field(
        fieldname=FieldName.SHOP_COMMENT_URL,
        css_selector=
        'body > div > div.rax-scrollview > div > div:nth-child(1) > div > div:nth-child(3) > div:nth-child(2)',
    Field(fieldname=FieldName.SHOP_FACILITIES,
          css_selector='#_j_hotel_info',
          attr='innerHTML',
          filter_func=get_shop_facilities,
          offset=6,
          try_times=10,
          pause_time=1),
    Field(fieldname=FieldName.SHOP_STATISTICS,
          css_selector='#_j_comment',
          attr='innerHTML',
          filter_func=get_shop_stattistics),
)

# Page named "Mafengwo hotel shop list page": list items are matched by
# '#_j_hotel_list > div'. No is_save argument is passed for this page.
page_shop_1 = Page(
    name='马蜂窝酒店店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(list_css_selector='#_j_hotel_list > div'),
    mongodb=Mongodb(db=TravelDriver.db,
                    collection=TravelDriver.shop_collection))

# Page named "Mafengwo hotel shop detail page": configured with a TabSetup
# click selector 'div.hotel-pic > a' and is_save=True.
page_shop_2 = Page(name='马蜂窝酒店店铺详情页面',
                   fieldlist=fl_shop2,
                   tabsetup=TabSetup(click_css_selector='div.hotel-pic > a'),
                   mongodb=Mongodb(db=TravelDriver.db,
                                   collection=TravelDriver.shop_collection),
                   is_save=True)


class MafengwoHotelSpider(TravelDriver):
    def get_shop_info(self):
        try:
            shop_data_list = self.from_page_get_data_list(page=page_shop_1)
Example #27
0
          filter_func=get_shop_rate),
    Field(fieldname=FieldName.SHOP_TAG,
          css_selector='div.txt > span.comment-list',
          attr='innerHTML',
          filter_func=get_shop_tag,
          pause_time=1),
    Field(fieldname=FieldName.SUBTYPE_NAME,
          css_selector='div.txt > div.tag-addr > a:nth-child(1)',
          filter_func=get_shop_subtype_name),
    Field(fieldname=FieldName.SHOP_ADDRESS,
          css_selector='div.txt > div.tag-addr > span.addr'))

# Page named "Dianping car-service shop list page": list items are matched by
# '#shop-all-list > ul > li'; bound to the shop collection with is_save=True.
page_shop_1 = Page(name='大众点评爱车店铺列表页面',
                   fieldlist=fl_shop1,
                   listcssselector=ListCssSelector(
                       list_css_selector='#shop-all-list > ul > li'),
                   mongodb=Mongodb(db=TravelDriver.db,
                                   collection=TravelDriver.shop_collection),
                   is_save=True)


def get_shop_time(self, _str):
    """Extract the opening-hours line from the HTML ``_str``.

    Scans every ``p.info.info-indent`` element and returns the text of the
    last one containing '营业时间' ("business hours"). Returns '' when none
    matches or when any exception occurs while parsing.
    """
    try:
        texts = [node.text()
                 for node in PyQuery(_str)('p.info.info-indent').items()]
        matching = [text for text in texts if '营业时间' in text]
        # The last matching element wins, as in a sequential overwrite.
        return matching[-1] if matching else ''
    except Exception:
        return ''
    Field(
        fieldname=FieldName.SHOP_FEATURE,
        css_selector='div > div.sight_item_about > div > div.intro.color999'),
    Field(
        fieldname=FieldName.SHOP_GRADE,
        css_selector=
        'div > div.sight_item_about > div > div.clrfix > div > span.product_star_level > em > span',
        attr='style',
        filter_func=get_shop_grade,
        is_info=True),
)
# The detail page below is configured with an empty Fieldlist.
fl_shop2 = Fieldlist()
# Page named "Qunar scenic-spot shop list page": list items are matched by
# '#search-list > div'; bound to the shop collection with is_save=True.
page_shop_1 = Page(
    name='去哪儿景点店铺列表页面',
    fieldlist=fl_shop1,
    listcssselector=ListCssSelector(list_css_selector='#search-list > div'),
    mongodb=Mongodb(db=TravelDriver.db,
                    collection=TravelDriver.shop_collection),
    is_save=True)
# NOTE(review): this empty tuple is rebound by the next statement, so it is
# never read — looks like a leftover; confirm it can be removed.
page_shop_2 = ()

# Page named "Qunar scenic-spot shop detail page": configured with a TabSetup
# click selector pointing at the item's title link, and is_save=True.
page_shop_2 = Page(
    name='去哪儿景点店铺详情页面',
    fieldlist=fl_shop2,
    tabsetup=TabSetup(
        click_css_selector='div > div.sight_item_about > h3 > a'),
    mongodb=Mongodb(db=TravelDriver.db,
                    collection=TravelDriver.shop_collection),
    is_save=True)