def get_shop_comment(self):
    """Crawl the paginated comments of every stored shop that has a shop URL.

    Opens a Baidu page first, reads ``(shop_name, shop_url)`` pairs for the
    current data key from the shop collection on host ``10.1.17.15``, then
    for each shop reopens its page until the tab title no longer contains
    '验证中心' (treated as a verification page, per the log message), pages
    through the '#remark_page' pager extracting ``page_comment_1`` data, and
    closes the page.
    """
    self.fast_new_page(url='http://www.baidu.com')
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='10.1.17.15').get_collection()
    targets = [(doc.get('shop_name'), doc.get('shop_url'))
               for doc in collection.find(self.get_data_key())
               if doc.get('shop_url')]

    next_link = '#remark_page > a.page-next'
    for position, (shop_name, shop_url) in enumerate(targets, start=1):
        self.info_log(data='第%s个,%s' % (position, shop_name))
        # Original note: verification happens when a page is first opened and
        # is not needed afterwards.
        # NOTE(review): this retry loop has no upper bound.
        while True:
            self.is_ready_by_proxy_ip()
            self.switch_window_by_index(index=-1)
            self.deal_with_failure_page()
            self.fast_new_page(url=shop_url)
            time.sleep(1)
            # Select the most recently opened window.
            self.switch_window_by_index(index=-1)
            if '验证中心' not in self.driver.title:
                break
            self.info_log(data='关闭验证页面!!!')
            self.close_curr_page()
        self.until_click_no_next_page_by_css_selector(
            nextpagesetup=NextPageCssSelectorSetup(
                css_selector=next_link,
                stop_css_selector=next_link + '.hidden',
                main_pagefunc=PageFunc(func=self.from_page_get_data_list,
                                       page=page_comment_1)))
        self.close_curr_page()
def get_comment_info_list(self):
    """Extract comment data from the comment page of every stored shop.

    Opens a Baidu page, reads ``(shop_name, shop_comment_url)`` pairs for the
    current data key from the local shop collection, and for each shop opens
    Baidu again, opens the comment URL, records the shop name on
    ``self.shop_name``, runs ``from_page_get_data_list`` on
    ``page_comment_1`` and closes the page.
    """
    self.fast_new_page('http://www.baidu.com')
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='localhost').get_collection()
    targets = [(doc.get('shop_name'), doc.get('shop_comment_url'))
               for doc in collection.find(self.get_data_key())
               if doc.get('shop_comment_url')]

    for position, (shop_name, comment_url) in enumerate(targets, start=1):
        # Anti-crawling measures may be encountered here.
        self.info_log(data='第%s个,%s' % (position, shop_name))
        self.fast_new_page('http://www.baidu.com')
        self.fast_new_page(url=comment_url)
        self.shop_name = shop_name
        # The returned list is not used here; the original kept (disabled)
        # code that saved each entry to the comments collection.
        self.from_page_get_data_list(page=page_comment_1)
        self.close_curr_page()
def get_comment_info_list(self):
    """Crawl the paginated comments of every stored shop.

    Opens a Baidu page, reads ``(shop_name, shop_comment_url)`` pairs for the
    current data key from the local shop collection, and for each shop opens
    Baidu again, opens the comment URL, records the shop name on
    ``self.shop_name``, waits, pages through the '下一页' link extracting
    ``page_comment_1`` data, and closes the page.

    Only documents that have both a ``shop_url`` (the original filter) and a
    ``shop_comment_url`` (the URL actually opened) are visited. Previously a
    document without a comment URL was opened with ``None`` as its URL.
    """
    self.fast_new_page(url='http://www.baidu.com')
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='localhost').get_collection()
    targets = [(doc.get('shop_name'), doc.get('shop_comment_url'))
               for doc in collection.find(self.get_data_key())
               if doc.get('shop_url') and doc.get('shop_comment_url')]

    for position, (shop_name, comment_url) in enumerate(targets, start=1):
        self.fast_new_page(url="https://www.baidu.com")
        # Anti-crawling measures may be encountered here.
        self.info_log(data='第%s个,%s' % (position, shop_name))
        self.fast_new_page(url=comment_url)
        self.shop_name = shop_name
        time.sleep(5)
        self.until_click_no_next_page_by_partial_link_text(
            nextpagesetup=NextPageLinkTextSetup(
                link_text='下一页',
                main_pagefunc=PageFunc(func=self.from_page_get_data_list,
                                       page=page_comment_1),
                pause_time=2))
        self.close_curr_page()
def get_shop_des(self):
    """Refresh the stored detail fields of every shop in the collection.

    Opens a Baidu page, reads ``(shop_name, shop_url)`` for every document
    matching the current data key from the local shop collection, and for
    each one opens the shop page scrolled to the bottom, waits, extracts the
    ``detail_shop_2`` fields and writes them back to the document identified
    by the data key plus the shop URL.
    """
    self.fast_new_page(url="http://www.baidu.com")
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='localhost').get_collection()
    # NOTE(review): the original had a disabled filter on shop_url/shop_flag
    # here, so every matching document is visited, including ones whose
    # shop_url may be missing.
    targets = [(doc.get('shop_name'), doc.get('shop_url'))
               for doc in collection.find(self.get_data_key())]

    for position, (shop_name, shop_url) in enumerate(targets, start=1):
        self.info_log(data='第%s个,%s' % (position, shop_name))
        self.fast_new_page(url=shop_url, is_scroll_to_bottom=True)
        time.sleep(5)
        data = self.from_fieldlist_get_data(page=detail_shop_2)
        selector = self.merge_dict(self.get_data_key(),
                                   {FieldName.SHOP_URL: shop_url})
        self.update_data_to_mongodb(collection, selector, data)
        self.close_curr_page()
def get_comment_list(self):
    """Crawl the paginated comments of every stored shop that has a shop URL.

    Opens a Baidu page, reads ``(shop_name, shop_url)`` pairs for the current
    data key from the local shop collection, and for each shop records the
    name on ``self.shop_name``, opens Baidu and then the shop page, pages
    through the '#allCmtComment' pager extracting ``page_comment_1`` data,
    and closes the page.
    """
    self.fast_new_page(url="http://www.baidu.com")
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='localhost').get_collection()
    targets = [(doc.get('shop_name'), doc.get('shop_url'))
               for doc in collection.find(self.get_data_key())
               if doc.get('shop_url')]

    next_link = '#allCmtComment > div.paging.orangestyle > div > a.nextpage'
    for position, (shop_name, shop_url) in enumerate(targets, start=1):
        self.info_log(data='第%s个,%s' % (position, shop_name))
        self.shop_name = shop_name
        self.fast_new_page("http://www.baidu.com")
        self.fast_new_page(url=shop_url)
        self.until_click_no_next_page_by_css_selector(
            nextpagesetup=NextPageCssSelectorSetup(
                css_selector=next_link,
                stop_css_selector=next_link + '.hidden',
                main_pagefunc=PageFunc(func=self.from_page_get_data_list,
                                       page=page_comment_1),
                pause_time=2))
        self.close_curr_page()
def get_shop_comment(self):
    """Crawl the paginated comments of every stored shop with a comment URL.

    Reads ``(shop_name, shop_comment_url)`` pairs for the current data key
    from the shop collection on host ``10.1.17.15``. For each shop the
    comment page is reopened until the tab title no longer contains
    '验证中心' (treated as a verification page, per the log message); the
    '下一页' link is then followed page by page while extracting
    ``page_comment_1`` data, and the page is closed.
    """
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='10.1.17.15').get_collection()
    targets = [(doc.get('shop_name'), doc.get('shop_comment_url'))
               for doc in collection.find(self.get_data_key())
               if doc.get('shop_comment_url')]

    for position, (shop_name, comment_url) in enumerate(targets, start=1):
        self.info_log(data='第%s个,%s' % (position, shop_name))
        # NOTE(review): this retry loop has no upper bound.
        while True:
            self.is_ready_by_proxy_ip()
            self.switch_window_by_index(index=-1)
            self.deal_with_failure_page()
            self.fast_new_page(url=comment_url)
            time.sleep(1)
            # Select the most recently opened window.
            self.switch_window_by_index(index=-1)
            if '验证中心' not in self.driver.title:
                break
            self.info_log(data='关闭验证页面!!!')
            self.close_curr_page()
        self.until_click_no_next_page_by_partial_link_text(
            nextpagesetup=NextPageLinkTextSetup(
                link_text='下一页',
                main_pagefunc=PageFunc(func=self.from_page_get_data_list,
                                       page=page_comment_1)))
        self.close_curr_page()
def get_shop_detail(self):
    """Refresh the stored detail fields of every distinct shop URL.

    Collects the distinct ``FieldName.SHOP_URL`` values for the current data
    key from the shop collection on host ``10.1.17.15``. For each URL it
    prints a running zero-based counter, reopens the page until the tab
    title no longer contains '验证中心' (treated as a verification page, per
    the log message), extracts the ``page_shop_2`` fields, writes them back
    to the document identified by the data key plus the URL, and closes the
    page.
    """
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='10.1.17.15').get_collection()
    shop_urls = {doc.get(FieldName.SHOP_URL)
                 for doc in collection.find(self.get_data_key())}

    for counter, url in enumerate(shop_urls):
        print(counter)
        # NOTE(review): this retry loop has no upper bound.
        while True:
            self.is_ready_by_proxy_ip()
            self.switch_window_by_index(index=-1)
            self.deal_with_failure_page()
            self.fast_new_page(url=url)
            time.sleep(1)
            # Select the most recently opened window.
            self.switch_window_by_index(index=-1)
            if '验证中心' not in self.driver.title:
                break
            self.info_log(data='关闭验证页面!!!')
            self.close_curr_page()
        data = self.from_fieldlist_get_data(page=page_shop_2)
        selector = self.merge_dict(self.get_data_key(),
                                   {FieldName.SHOP_URL: url})
        self.update_data_to_mongodb(collection, selector, data)
        self.close_curr_page()
def get_comment_info_list(self):
    """Extract comment data from the mobile comment page of every shop.

    Reads ``(shop_name, shop_comment_url)`` pairs for the current data key
    from the local shop collection. For each shop it records the name on
    ``self.shop_name``, opens the comment URL and then performs three
    best-effort steps: click the header button if present, click the
    ``data-tagtype="44"`` tab, and touch-scroll down from the add-comment
    element. A failure in any step is logged and the crawl continues.
    Finally it clicks the go-top element and runs
    ``from_page_get_data_list`` on ``page_comment_1``.

    Changes from the previous version: failures are reported through
    ``info_log`` with the exception instead of printing ``111``/``222``/
    ``333``, and unused local bindings were dropped.

    NOTE(review): unlike the sibling crawlers, the opened page is never
    closed here - confirm whether that is intentional.
    """
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='localhost').get_collection()
    targets = [(doc.get('shop_name'), doc.get('shop_comment_url'))
               for doc in collection.find(self.get_data_key())
               if doc.get('shop_comment_url')]

    header_button = '#main-page > header > h2 > div:nth-child(2)'
    scroll_anchor = ('#main-page > div.mp-comment-mpcon > '
                     'div.mp-addcomment.mp-border-top > a > div')

    for position, (shop_name, comment_url) in enumerate(targets, start=1):
        # Anti-crawling measures may be encountered here.
        self.info_log(data='第%s个,%s' % (position, shop_name))
        self.shop_name = shop_name
        self.fast_new_page(url=comment_url)
        time.sleep(5)

        # Step 1: click the header button, but only if it exists (the lookup
        # raises when the element is absent).
        try:
            self.driver.find_element_by_css_selector(
                css_selector=header_button)
            self.fast_click_same_page_by_css_selector(
                click_css_selector=header_button)
            time.sleep(6)
        except Exception as exc:
            self.info_log(data='header button step skipped: %r' % (exc,))

        # Step 2: click the "latest" tab.
        try:
            latest_tab = self.driver.find_element_by_xpath(
                '//li[@data-tagtype="44"]')
            ActionChains(self.driver).click(latest_tab).perform()
            time.sleep(5)
        except Exception as exc:
            self.info_log(data='latest tab step skipped: %r' % (exc,))

        # Step 3: scroll down from the add-comment element.
        try:
            anchor = self.driver.find_element_by_css_selector(
                css_selector=scroll_anchor)
            touch_actions = TouchActions(self.driver)
            touch_actions.scroll_from_element(on_element=anchor,
                                              xoffset=0,
                                              yoffset=8000).perform()
            time.sleep(5)
        except Exception as exc:
            self.info_log(data='scroll step skipped: %r' % (exc,))

        self.fast_click_same_page_by_css_selector(
            click_css_selector='#main-page > div.mp-gotop > div')
        time.sleep(6)
        # The returned list is not used here.
        self.from_page_get_data_list(page=page_comment_1)
def get_shop_detail(self):
    """Refresh the stored detail fields of every distinct shop URL.

    Collects the distinct ``FieldName.SHOP_URL`` values for the current data
    key from the local shop collection; for each URL opens the page, waits,
    extracts the ``page_shop_2`` fields, writes them back to the document
    identified by the data key plus the URL, and closes the page.
    """
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='localhost').get_collection()
    shop_urls = {doc.get(FieldName.SHOP_URL)
                 for doc in collection.find(self.get_data_key())}

    for shop_url in shop_urls:
        self.fast_new_page(url=shop_url)
        time.sleep(5)
        data = self.from_fieldlist_get_data(page=page_shop_2)
        selector = self.merge_dict(self.get_data_key(),
                                   {FieldName.SHOP_URL: shop_url})
        self.update_data_to_mongodb(collection, selector, data)
        self.close_curr_page()
def get_comment_info_list(self):
    """Extract comment data from the comment page of every stored shop.

    Reads ``(shop_name, shop_comment_url)`` pairs for the current data key
    from the local shop collection; for each shop opens the comment URL,
    waits, runs ``from_page_get_data_list`` on ``page_comment_1`` and closes
    the page.
    """
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='localhost').get_collection()
    targets = [(doc.get('shop_name'), doc.get('shop_comment_url'))
               for doc in collection.find(self.get_data_key())
               if doc.get('shop_comment_url')]

    for position, (shop_name, comment_url) in enumerate(targets, start=1):
        # Anti-crawling measures may be encountered here.
        self.info_log(data='第%s个,%s' % (position, shop_name))
        self.fast_new_page(url=comment_url)
        time.sleep(5)
        # The returned list is not used here.
        self.from_page_get_data_list(page=page_comment_1)
        self.close_curr_page()
def get_comment_info_list(self):
    """Extract comment data from the shop page of every stored shop.

    Reads ``(shop_name, shop_url)`` pairs for the current data key from the
    local shop collection; for each shop opens Baidu and then the shop page,
    records the name on ``self.shop_name``, waits, refreshes the page, clicks
    the third tab of the tabs box, runs ``from_page_get_data_list`` on
    ``page_comment_1`` and closes the page.
    """
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='localhost').get_collection()
    targets = [(doc.get('shop_name'), doc.get('shop_url'))
               for doc in collection.find(self.get_data_key())
               if doc.get('shop_url')]

    comment_tab = ('#detail-placeholder > div.main > div.tabs-box > '
                   'div:nth-child(1) > ul > li:nth-child(3)')
    for position, (shop_name, shop_url) in enumerate(targets, start=1):
        self.fast_new_page(url="https://www.baidu.com")
        # Anti-crawling measures may be encountered here.
        self.info_log(data='第%s个,%s' % (position, shop_name))
        self.fast_new_page(url=shop_url)
        self.shop_name = shop_name
        time.sleep(5)
        self.driver.refresh()
        self.fast_click_same_page_by_css_selector(
            click_css_selector=comment_tab)
        # The returned list is not used here.
        self.from_page_get_data_list(page=page_comment_1)
        self.close_curr_page()
def get_shop_comment(self):
    """Crawl the paginated reviews of one hard-coded dianping shop page.

    Still loads the ``(shop_name, shop_comment_url)`` pairs for the current
    data key from the shop collection on host ``10.1.17.15``, but does not
    use them: the per-shop loop was disabled in the original. It opens a
    fixed ``review_all`` URL, reads the ``text`` attribute of the first
    review-tag span (the value is discarded), pages through the '下一页'
    link extracting ``page_comment_1`` data, and closes the page.

    NOTE(review): this looks like work-in-progress/debug code - confirm
    before relying on it.
    """
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='10.1.17.15').get_collection()
    # Built as in the original (this runs the query) but not read afterwards.
    shop_name_url_list = [
        (doc.get('shop_name'), doc.get('shop_comment_url'))
        for doc in collection.find(self.get_data_key())
        if doc.get('shop_comment_url')
    ]

    self.fast_new_page(
        url='http://www.dianping.com/shop/110833569/review_all')
    first_tag = ('#review-list > div.review-list-container > '
                 'div.review-list-main > div.reviews-wrapper > '
                 'div.reviews-tags > div.content > span:nth-child(%d)' % 1)
    # Indexing [0] raises IndexError when no such span exists, as before.
    self.driver.find_elements_by_css_selector(
        first_tag)[0].get_attribute('text')
    self.until_click_no_next_page_by_partial_link_text(
        nextpagesetup=NextPageLinkTextSetup(
            link_text='下一页',
            main_pagefunc=PageFunc(func=self.from_page_get_data_list,
                                   page=page_comment_1)))
    self.close_curr_page()
def get_shop_comment(self):
    """Crawl the newest-first paginated reviews of every stored shop.

    Opens a Baidu page, reads ``(shop_name, shop_comment_url)`` pairs for
    the current data key from the shop collection (default host), and for
    each shop opens the comment URL without scrolling to the bottom, clicks
    the '默认排序' link and then the '最新点评' link, pages through the
    'a.NextPage' pager extracting ``page_comment_1`` data, and closes the
    page.
    """
    self.fast_new_page(url='http://www.baidu.com')
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         ).get_collection()
    targets = [(doc.get('shop_name'), doc.get('shop_comment_url'))
               for doc in collection.find(self.get_data_key())
               if doc.get('shop_comment_url')]

    next_link = ('#review-list > div.review-list-container > '
                 'div.review-list-main > div.reviews-wrapper > '
                 'div.bottom-area.clearfix > div > a.NextPage')
    for position, (shop_name, comment_url) in enumerate(targets, start=1):
        self.info_log(data='第%s个,%s' % (position, shop_name))
        self.fast_new_page(url=comment_url, is_scroll_to_bottom=False)
        time.sleep(3)
        # Open the sort menu, then switch to the newest reviews.
        self.driver.find_element_by_link_text(link_text='默认排序').click()
        time.sleep(2)
        self.driver.find_element_by_link_text(link_text='最新点评').click()
        time.sleep(5)
        self.until_click_no_next_page_by_css_selector(
            nextpagesetup=NextPageCssSelectorSetup(
                css_selector=next_link,
                stop_css_selector=next_link + '.hidden',
                main_pagefunc=PageFunc(func=self.from_page_get_data_list,
                                       page=page_comment_1),
                pause_time=5))
        self.close_curr_page()
def get_comment_list(self):
    """Crawl the paginated comments of every stored shop that has a shop URL.

    Opens a Baidu page, reads ``(shop_name, shop_url)`` pairs for the current
    data key from the local shop collection, and for each shop opens the
    shop page, pages through the '#allCmtComment' pager extracting
    ``page_comment_1`` data, and closes the page.
    """
    self.fast_new_page(url="http://www.baidu.com")
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='localhost').get_collection()
    targets = [(doc.get('shop_name'), doc.get('shop_url'))
               for doc in collection.find(self.get_data_key())
               if doc.get('shop_url')]

    next_link = '#allCmtComment > div.paging.orangestyle > div > a.nextpage'
    for position, (shop_name, shop_url) in enumerate(targets, start=1):
        self.info_log(data='第%s个,%s' % (position, shop_name))
        # The original carried a disabled proxy/verification retry loop here.
        self.fast_new_page(url=shop_url)
        self.until_click_no_next_page_by_css_selector(
            nextpagesetup=NextPageCssSelectorSetup(
                css_selector=next_link,
                stop_css_selector=next_link + '.hidden',
                main_pagefunc=PageFunc(func=self.from_page_get_data_list,
                                       page=page_comment_1),
                pause_time=5))
        self.close_curr_page()
def get_shop_detial(self):
    """Refresh the stored detail fields of every shop that has a shop URL.

    Opens a Baidu page, reads ``(shop_name, shop_url)`` pairs for the current
    data key from the local shop collection, and for each shop opens Baidu
    and then the shop page, clicks the first '只看文字' link, extracts the
    ``page_shop_2`` fields, writes them back to the document identified by
    the data key plus the shop URL, and closes the page.
    """
    self.fast_new_page(url="http://www.baidu.com")
    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='localhost').get_collection()
    targets = [(doc.get('shop_name'), doc.get('shop_url'))
               for doc in collection.find(self.get_data_key())
               if doc.get('shop_url')]

    for position, (shop_name, shop_url) in enumerate(targets, start=1):
        self.fast_new_page(url='http://www.baidu.com')
        self.info_log(data='第%s个,%s' % (position, shop_name))
        # The original carried a disabled proxy/verification retry loop here.
        self.fast_new_page(url=shop_url)
        self.fast_click_first_item_same_page_by_partial_link_text(
            link_text='只看文字')
        data = self.from_fieldlist_get_data(page=page_shop_2)
        selector = self.merge_dict(self.get_data_key(),
                                   {FieldName.SHOP_URL: shop_url})
        self.update_data_to_mongodb(collection, selector, data)
        self.close_curr_page()
def CommentsResults(request):
    """Render the stored comments of one shop as a table.

    Reads ``id`` and the ``FieldName.SHOP_NAME`` key from ``request.GET``,
    loads the matching ``Project`` and fetches every comment document whose
    website/region/source match the project and whose shop name matches.
    The bookkeeping fields are removed from each document; the remaining
    keys, minus any key containing ``'shop'``, become the columns.

    Each cell is a ``[display, full]`` pair:

    * ``['nonexistent', 'nonexistent']`` when the document lacks the key;
    * ``['0', '0']`` for a falsy value of a FLOAT/INT field, otherwise
      ``['null', 'null']`` for a falsy value;
    * otherwise the value, where lists are shown as the first 10 characters
      of their JSON form and non-numeric values longer than 10 are cut to
      10 characters plus ``'...'``.

    Fixes over the previous version: the 'shop' columns are filtered with a
    comprehension (removing from the list while iterating it skipped the
    element after each removal), columns are sorted so their order is stable
    between runs, the bare ``except`` is narrowed, and the builtin ``id`` is
    no longer shadowed.

    :param request: request object exposing the ``GET`` mapping.
    :return: result of ``render`` for ``spider/commentresults.html`` with
        ``thead_list`` (localized column titles) and ``tbody_list``.
    """
    project_id = request.GET['id']
    shop_name = request.GET[FieldName.SHOP_NAME]
    project = Project.objects.get(id=project_id)
    comments = Mongodb(
        host=TravelDriver.host,
        port=TravelDriver.port,
        db=TravelDriver.db,
        collection=TravelDriver.comments_collection).get_collection()
    comments_data_list = list(
        comments.find({
            FieldName.DATA_WEBSITE: str(project.data_website),
            FieldName.DATA_REGION: str(project.data_region),
            FieldName.DATA_SOURCE: str(project.data_source),
            FieldName.SHOP_NAME: shop_name,
        }))

    # Strip the bookkeeping fields and collect every remaining key.
    bookkeeping_fields = (
        FieldName.ID_,
        FieldName.DATA_WEBSITE,
        FieldName.DATA_REGION,
        FieldName.DATA_SOURCE,
        FieldName.SHOP_NAME,
    )
    seen_keys = set()
    for comment_data in comments_data_list:
        for field in bookkeeping_fields:
            comment_data.pop(field)
        seen_keys.update(comment_data.keys())

    # Build a new list instead of removing from the one being iterated.
    thead_list = sorted(key for key in seen_keys if 'shop' not in key)

    tbody_list = list()
    for comment_data in comments_data_list:
        td_list = list()
        for key in thead_list:
            if key not in comment_data:
                td_list.append(['nonexistent', 'nonexistent'])
                continue
            value_complete = comment_data[key]
            if not value_complete:
                field_type = FIELD_NAME_TYPE.get(key)
                if (field_type == FieldType.FLOAT
                        or field_type == FieldType.INT):
                    td_list.append(['0', '0'])
                else:
                    td_list.append(['null', 'null'])
                continue
            value = value_complete
            if isinstance(value_complete, list):
                # e.g. a list of photos: show the start of its JSON form.
                value = json.dumps(value)[:10]
            try:
                int(value)  # numeric-looking values are shown untruncated
            except (TypeError, ValueError):
                if len(value) > 10:
                    value = value[0:10] + u'...'
            td_list.append([value, value_complete])
        tbody_list.append(td_list)

    thead_chinese_list = [FIELD_NAME_ZH.get(thead) for thead in thead_list]
    return render(request,
                  'spider/commentresults.html',
                  context={
                      'thead_list': thead_chinese_list,
                      'tbody_list': tbody_list
                  })
def ShopResults(request):
    """Render the stored shops of one project as a table.

    Reads ``id`` from ``request.GET``, loads the matching ``Project`` and
    fetches every shop document whose website/region/source match the
    project. The bookkeeping fields are removed from each document and the
    remaining keys become the columns, with the shop name first and the
    comment count (when present) second.

    Every body row starts with a ``{FieldName.SHOP_NAME: ..., 'id': ...}``
    dict; the other cells are ``[display, full]`` pairs:

    * ``['nonexistent', 'nonexistent']`` when the document lacks the key;
    * ``['0', '0']`` for a falsy value of a FLOAT/INT field, otherwise
      ``['null', 'null']`` for a falsy value;
    * otherwise the value, where lists are shown as the first 10 characters
      of their JSON form and non-numeric values longer than 10 are cut to
      10 characters plus ``'...'``.

    Fixes over the previous version: when columns exist the shop-name column
    is always placed first, so the header lines up with the body rows even if
    no document carries that key (previously the first arbitrary column was
    dropped from the body); columns are sorted so their order is stable
    between runs; the bare ``except`` is narrowed; and the builtin ``id`` is
    no longer shadowed.

    :param request: request object exposing the ``GET`` mapping.
    :return: result of ``render`` for ``spider/shopresults.html`` with
        ``thead_tuple_list`` (``(localized title, field name)`` pairs) and
        ``tbody_list``.
    """
    project_id = request.GET['id']
    project = Project.objects.get(id=project_id)
    shops = Mongodb(host=TravelDriver.host,
                    port=TravelDriver.port,
                    db=TravelDriver.db,
                    collection=TravelDriver.shop_collection).get_collection()
    shops_data_list = list(
        shops.find({
            FieldName.DATA_WEBSITE: str(project.data_website),
            FieldName.DATA_REGION: str(project.data_region),
            FieldName.DATA_SOURCE: str(project.data_source),
        }))

    # Strip the bookkeeping fields and collect every remaining key.
    bookkeeping_fields = (
        FieldName.ID_,
        FieldName.DATA_WEBSITE,
        FieldName.DATA_REGION,
        FieldName.DATA_SOURCE,
    )
    seen_keys = set()
    for shop_data in shops_data_list:
        for field in bookkeeping_fields:
            shop_data.pop(field)
        seen_keys.update(shop_data.keys())
    thead_list = sorted(seen_keys)

    # Body rows always begin with the shop-name cell, so the header must too.
    if thead_list:
        if FieldName.SHOP_NAME in thead_list:
            thead_list.remove(FieldName.SHOP_NAME)
        thead_list.insert(0, FieldName.SHOP_NAME)
    if FieldName.SHOP_COMMENT_NUM in thead_list:
        thead_list.remove(FieldName.SHOP_COMMENT_NUM)
        thead_list.insert(1, FieldName.SHOP_COMMENT_NUM)

    # (localized title, field name) pairs.
    thead_tuple_list = [(FIELD_NAME_ZH.get(thead), thead)
                        for thead in thead_list]

    tbody_list = list()
    for shop_data in shops_data_list:
        td_list = [{
            FieldName.SHOP_NAME: shop_data.get(FieldName.SHOP_NAME),
            'id': project_id
        }]
        # Index 0 is the shop-name column, already emitted above.
        for key in thead_list[1:]:
            if key not in shop_data:
                td_list.append(['nonexistent', 'nonexistent'])
                continue
            value_complete = shop_data[key]  # unmodified value
            if not value_complete:
                field_type = FIELD_NAME_TYPE.get(key)
                if (field_type == FieldType.FLOAT
                        or field_type == FieldType.INT):
                    td_list.append(['0', '0'])
                else:
                    td_list.append(['null', 'null'])
                continue
            value = value_complete
            if isinstance(value_complete, list):
                # e.g. a list of photos: show the start of its JSON form.
                value = json.dumps(value)[:10]
            try:
                int(value)  # numeric-looking values are shown untruncated
            except (TypeError, ValueError):
                if len(value) > 10:
                    value = value[0:10] + u'...'
            td_list.append([value, value_complete])
        tbody_list.append(td_list)

    return render(request,
                  'spider/shopresults.html',
                  context={
                      'thead_tuple_list': thead_tuple_list,
                      'tbody_list': tbody_list,
                  })
def get_shop_address(self):
    """Look up every stored shop on the Baidu coordinate picker page.

    Opens the Baidu ``getpoint`` page and switches the city to '淳安'. Then,
    for each ``(shop_name, shop_url)`` pair of the current data key in the
    local shop collection, it switches the city again, searches for the shop
    name (prefixed with '千岛湖' unless the name already contains it), clicks
    the first result, extracts the ``page_shop_2`` fields and writes them
    back to the document identified by the data key plus the shop URL.
    Any exception from the result click onward is caught and reported with a
    printed message; the loop then continues with the next shop.
    """

    def switch_city():
        # Open the city chooser, type the city and confirm.
        self.fast_click_first_item_same_page_by_partial_link_text(
            link_text='更换城市')
        self.until_scroll_to_center_send_text_by_css_selector(
            css_selector='#selCityInput', text='淳安')
        time.sleep(2)
        self.fast_click_same_page_by_css_selector(
            click_css_selector='#selCityButton')
        time.sleep(2)

    self.fast_new_page(
        url='http://api.map.baidu.com/lbsapi/getpoint/index.html')
    switch_city()

    collection = Mongodb(db=TravelDriver.db,
                         collection=TravelDriver.shop_collection,
                         host='localhost').get_collection()
    targets = [(doc.get('shop_name'), doc.get('shop_url'))
               for doc in collection.find(self.get_data_key())
               if doc.get('shop_url')]

    for position, (shop_name, shop_url) in enumerate(targets, start=1):
        self.info_log(data='第%s个,%s' % (position, shop_name))
        switch_city()

        if '千岛湖' in shop_name:
            search_text = shop_name
        else:
            search_text = '千岛湖' + shop_name
        self.until_scroll_to_center_send_text_by_css_selector(
            css_selector='#localvalue', text=search_text)
        time.sleep(2)
        self.fast_click_same_page_by_css_selector(
            click_css_selector='#localsearch')
        time.sleep(2)

        try:
            # Click the first search result; raises when there is none.
            self.driver.find_element_by_css_selector(
                css_selector='#no_0 > a').click()
            time.sleep(2)
            data = self.from_fieldlist_get_data(page=page_shop_2)
            selector = self.merge_dict(self.get_data_key(),
                                       {FieldName.SHOP_URL: shop_url})
            self.update_data_to_mongodb(collection, selector, data)
        except Exception:
            print("改地址无经纬度")
from spider.driver.base.field import *
from spider.driver.travel.core.traveldriver import (
    WebsiteName,
    DataSourceName,
    TravelDriver,
)
from spider.driver.base.mongodb import Mongodb

# Ad-hoc script: print the shop_url projection of every XIECHENG hotel shop
# stored for the '千岛湖' region.

# Shop collection on the local MongoDB instance.
shops = Mongodb(
    db=TravelDriver.db,
    collection=TravelDriver.shop_collection,
    host='127.0.0.1',
).get_collection()

# Comments collection (default host); opened but not used below.
comments = Mongodb(
    db=TravelDriver.db,
    collection=TravelDriver.comments_collection,
).get_collection()

# Query selecting the documents of one source/website/region.
key = {
    FieldName.DATA_SOURCE: DataSourceName.HOTEL,
    FieldName.DATA_WEBSITE: WebsiteName.XIECHENG,
    FieldName.DATA_REGION: '千岛湖',
}

# Only the shop_url field is requested through the projection argument.
for i in shops.find(key, {'shop_url': 1}):
    print(i)
# -*- coding:utf-8 -*-
from spider.driver.base.field import *
from spider.driver.travel.core.traveldriver import (
    WebsiteName,
    DataSourceName,
    TravelDriver,
)
from spider.driver.base.mongodb import Mongodb

# Ad-hoc script: compare the number of QUNAR hotel shops stored for the
# '千岛湖' region with the number of distinct shop names among them.

# Shop collection on host 10.1.17.15.
shops = Mongodb(
    db=TravelDriver.db,
    collection=TravelDriver.shop_collection,
    host='10.1.17.15',
).get_collection()

# Comments collection (default host); opened but not used below.
comments = Mongodb(
    db=TravelDriver.db,
    collection=TravelDriver.comments_collection,
).get_collection()

# Query selecting the documents of one source/website/region.
key = {
    FieldName.DATA_SOURCE: DataSourceName.HOTEL,
    FieldName.DATA_WEBSITE: WebsiteName.QUNAR,
    FieldName.DATA_REGION: '千岛湖',
}

shop_name_list = [i.get(FieldName.SHOP_NAME) for i in shops.find(key)]

# Total documents, then distinct names; a difference means duplicate names.
print(len(shop_name_list))
print(len(set(shop_name_list)))