def get_hotel(driver: WebDriver, city: str, n: int) -> None:
    """Scrape page ``n`` of the hotel listing for ``city`` and store new hotels.

    Loads ``ROOT_URL/<city>/p<n>``, walks every ``searchresult_list`` element
    inside ``#hotel_list`` and creates a ``Hotel`` row for each hotel whose
    ``hid`` is not stored yet.

    :param driver: Selenium driver used to load and query the page.
    :param city: city path segment of the listing URL.
    :param n: 1-based page number as used in the URL (``p<n>``).
    """
    driver.get('%s/%s/p%d' % (ROOT_URL, city, n))
    driver.implicitly_wait(1)
    hotel_list = driver.find_element_by_id('hotel_list')
    hotels = hotel_list.find_elements_by_class_name('searchresult_list')
    for hotel in hotels:
        hid = str(hotel.get_attribute('id'))
        # Only entries whose element id is purely numeric are treated as hotels.
        if not re.match(r'^\d+$', hid):
            continue
        name = driver.find_element_by_xpath(
            '//*[@id="%s"]/ul/li[2]/h2/a' % hid).get_attribute('title')
        try:
            points = hotel.find_element_by_class_name('hotel_value').text
        except Exception:
            # Entries without a 'hotel_value' element are skipped.
            continue
        start_price = hotel.find_element_by_class_name('J_price_lowList').text
        about_points = hotel.find_element_by_class_name('hotel_judgement').text
        count_match = RE_COMMENT.search(about_points)
        if count_match is None:
            # Previously this raised AttributeError on ``None.group()`` and
            # aborted the whole page; skip just this hotel instead.
            logging.warning(
                'no comment count found for hotel %s in %r', hid, about_points)
            continue
        points_count = count_match.group()
        logging.info('%s\n%s\n%s\n%s\n%s\n%s\n%s\n',
                     city, hid, name, n, points, start_price, points_count)
        if not Hotel.objects.filter(hid=hid).exists():
            Hotel.objects.create(city=city, hid=hid, name=name, page=n,
                                 points=points, start_price=start_price,
                                 points_count=points_count)
def get_driver():
    """Create the browser driver selected by ``settings.OPEN511_UI_TEST_BROWSER``.

    The PhantomJS driver is used when the setting equals ``'phantomjs'``;
    any other value selects the Firefox driver. The returned driver has a
    5-second implicit wait and a 1000x700 window.
    """
    wants_phantomjs = settings.OPEN511_UI_TEST_BROWSER == 'phantomjs'
    # The selenium driver class is imported lazily, only for the chosen browser.
    if wants_phantomjs:
        from selenium.webdriver.phantomjs.webdriver import WebDriver as browser_cls
    else:
        from selenium.webdriver.firefox.webdriver import WebDriver as browser_cls
    browser = browser_cls()
    browser.implicitly_wait(5)
    browser.set_window_size(1000, 700)
    return browser
def create_webdriver():
    """Instantiate ``WebDriver`` with a 60-second implicit wait and a 1400x1000 window."""
    implicit_wait_seconds = 60
    width, height = 1400, 1000

    browser = WebDriver()
    browser.implicitly_wait(implicit_wait_seconds)
    browser.set_window_size(width, height)
    return browser
def create_webdriver():
    """Return a new ``WebDriver`` configured for the tests.

    The driver waits implicitly for up to 60 seconds when locating elements
    and its window is resized to 1400 by 1000.
    """
    window_size = (1400, 1000)
    driver = WebDriver()
    driver.implicitly_wait(60)
    driver.set_window_size(*window_size)
    return driver
class HtmlURLUtil:
    """
    HTML request helper.

    Notes from the original author on the libraries involved:

    * urllib: Python core library; used here only for URL handling, not for
      issuing requests.
    * tld (top level domain): a convenient URL/domain helper.
    * selenium: web automation/testing tool.
    * PhantomJS: headless WebKit used to perform the request, which makes it
      possible to fetch AJAX-rendered pages such as search engine results.

    NOTE: parts of this class (``urllib.splitquery``, ``urlparse``,
    ``reload(sys)``) are Python 2 APIs and are kept as-is.
    """
    __USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) " \
                   "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"

    def __init__(self, driver=None):
        """Store an optional driver and the default request headers.

        :param driver: an existing driver, or None. Note that ``getHtml``
            replaces ``self.driver`` with a freshly created one.
        """
        self.driver = driver
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            'Cache-Control': 'max-age=0',
            'User-Agent': HtmlURLUtil.__USER_AGENT,
            'Connection': 'keep-alive',
            'Referer': 'https://www.baidu.com/'
        }

    def getHtml(self, url, referer="https://www.baidu.com/"):
        """Load ``url`` in a new driver and return the page source.

        On success the driver is left open (see ``closeWebDriver``). On any
        error the exception is logged, the current driver (if any) is quit
        and an empty string is returned.

        :param url: page to load.
        :param referer: value sent as the ``Referer`` custom header.
        :return: the page source, or ``""`` on failure.
        """
        _result = ""
        try:
            my_dc = DesiredCapabilities.PHANTOMJS.copy()
            my_dc["browserName"] = "chrome"
            my_dc["platform"] = "mac"
            my_dc["version"] = "63.0.3239.84"
            my_dc["phantomjs.page.settings.loadImages"] = False
            my_dc["phantomjs.page.settings.userAgent"] = HtmlURLUtil.__USER_AGENT

            service_args = ["--load-images=false", "--disk-cache=false",
                            "--ignore-ssl-errors=true"]

            # ``items`` works on both Python 2 and 3 (``iteritems`` is 2-only).
            for head, value in self.headers.items():
                my_dc["phantomjs.page.customHeaders.{}".format(head)] = value

            # The explicit referer overrides the default one from self.headers.
            my_dc["phantomjs.page.customHeaders.Referer"] = referer

            # NOTE(review): any driver previously held in self.driver is
            # replaced here without being quit - confirm callers close it.
            self.driver = WebDriver(desired_capabilities=my_dc, service_args=service_args)
            self.driver.set_script_timeout(20)
            self.driver.set_page_load_timeout(30)
            self.driver.implicitly_wait(5)
            self.driver.set_window_size(2560, 1066)

            self.driver.get(url)
            _result = self.driver.page_source
        except Exception:
            log.getLogger().exception("HtmlURLUtil getHtml error...")
            # The driver may never have been created (construction failed and
            # no driver was passed to __init__); only quit a real one.
            if self.driver is not None:
                self.driver.quit()
        return _result

    def closeWebDriver(self):
        """Quit the current driver; does nothing when no driver is set."""
        if self.driver is not None:
            self.driver.quit()

    def getSortQS(self, url):
        """
        Get the sorted query string of ``url``.

        :param url: URL to inspect.
        :return: ``None`` when the URL has no query string, otherwise the
            result of ``sort_util.fastSortDict`` applied to the parsed query
            (per the original author's note, a quicksort, O(n log n)).
        """
        a = urllib.splitquery(url)
        if len(a) <= 1 or not a[1]:
            return None
        qs = urlparse.parse_qs(a[1])
        return sort_util.fastSortDict(qs)

    def getTLD(self, url):
        """
        Get the domain object for ``url``.

        :param url: URL to inspect.
        :return: the result of ``tld.get_tld`` on the URL without its query
            string, or ``None`` when ``url`` is empty or the lookup fails
            (the failure is logged).
        """
        try:
            if not url:
                return None
            web = urllib.splitquery(url)[0]
            return tld.get_tld(web)
        except Exception:
            log.getLogger().exception("getTLD ...%s" % url)
        return None

    def getMd5URL(self, url):
        """
        Compute the md5 of ``url``.

        The query parameters are sorted first so the parameter order does not
        affect the digest input, then the md5 is taken.

        :param url: URL to hash.
        :return: the result of ``md5_util.md5``.
        """
        web = urllib.splitquery(url)[0]
        string = web + str(self.getSortQS(url))
        return md5_util.md5(string)

    def getElementsByTagName(self, elname):
        """Return the elements of the current page with tag name ``elname``."""
        return self.driver.find_elements_by_tag_name(elname)

    def writeWebContentToFile(self, webcontent, filepath):
        """Write ``webcontent`` to ``filepath``, creating parent directories.

        Empty content is ignored. Errors while creating the directory or
        writing the file are logged, not raised.

        :param webcontent: text to write.
        :param filepath: destination file path.
        """
        if not webcontent:
            return

        # NOTE: Python 2-only hack that changes the process-wide default
        # encoding; kept because other code may rely on this side effect.
        reload(sys)
        sys.setdefaultencoding("utf-8")

        try:
            _dir = os.path.dirname(filepath)
            # dirname is "" for a bare file name; makedirs("") would fail.
            if _dir and not os.path.exists(_dir):
                os.makedirs(_dir)

            # ``with`` closes the file on every path. The previous
            # ``finally: f.close()`` failed with an unbound ``f`` whenever
            # open() itself raised, masking the real error.
            with open(filepath, "w") as f:
                f.write(webcontent)
        except Exception:
            log.getLogger().exception("htmlutil writeWebContentToFile ...")

    def getCharset(self, content):
        """Extract the charset declared in a ``<meta>`` tag of ``content``.

        :param content: HTML text to search.
        :return: the declared charset in lower case, or ``"utf-8"`` when no
            declaration is found.
        """
        charset = "utf-8"
        m = re.compile('<meta .*(http-equiv="?Content-Type"?.*)?charset="?([a-zA-Z0-9_-]+)"?', re.I)\
            .search(content)
        if m and m.lastindex == 2:
            charset = m.group(2).lower()
        return charset
def create_webdriver(phantomjs_path=None):
    """Instantiate ``WebDriver`` with a 1400x1000 window and a 300-second implicit wait.

    :param phantomjs_path: value passed as the first positional argument to
        ``WebDriver`` (presumably the PhantomJS executable path - confirm
        against the imported ``WebDriver`` class). Defaults to the Travis CI
        build location that used to be hard-coded, so existing callers are
        unaffected while local runs can point at their own binary.
    :return: the configured driver.
    """
    if phantomjs_path is None:
        phantomjs_path = '/home/travis/build/mapbender/mapbender-starter/application/bin/phantomjs'
    wd = WebDriver(phantomjs_path)
    wd.set_window_size(1400, 1000)
    wd.implicitly_wait(300)
    return wd
def _find_comment_list(driver: WebDriver):
    """Return the element that holds the comment blocks of the loaded page."""
    selector = '#divCtripComment > div.comment_detail_list'
    try:
        return driver.find_elements_by_css_selector(selector)[1]
    except IndexError:
        # Fewer than two matches: raise the implicit wait and retry once.
        driver.implicitly_wait(5)
    try:
        return driver.find_elements_by_css_selector(selector)[1]
    except IndexError:
        # Still no second match: fall back to the first matching element.
        return driver.find_element_by_css_selector(selector)


def _record_comments_count(driver: WebDriver, hid: str) -> None:
    """Store the page's total comment count on the hotel if it is still 0."""
    if Hotel.objects.filter(hid=hid).count() != 1:
        return
    hotel = Hotel.objects.get(hid=hid)
    if hotel.comments_count != 0:
        return
    try:
        comment_text = driver.find_element_by_css_selector(
            "#commentTab > a").text
        logging.warning("\n%s\n", comment_text)
        hotel.comments_count = int(RE_COMMENT.search(comment_text).group())
        logging.warning("\n%s\n", hotel.comments_count)
        hotel.save()
    except Exception:
        # Best effort: the count is optional, but a failure is now logged
        # instead of being silently swallowed.
        logging.warning("could not update comments_count for hotel %s",
                        hid, exc_info=True)


def spider_comments(driver: WebDriver, hid: str, n: int) -> int:
    """Scrape page ``n`` of the reviews of hotel ``hid`` and store new comments.

    :param driver: Selenium driver used to load and query the page.
    :param hid: hotel id used in the review URL and stored on each comment.
    :param n: page number used in the review URL.
    :return: ``0`` when the page was already stored or was processed to the
        end; ``403`` when loading the page raised one of the handled
        connection errors; ``1`` when a comment on this page is already
        stored under a different page number.
    """
    # 15 stored comments for this page - presumably a full page; skip it.
    if Comment.objects.filter(hotel=hid).filter(page=n).count() == 15:
        return 0
    try:
        driver.get('%s/dianping/%s_p%dt0.html' % (ROOT_URL, hid, n))
        driver.implicitly_wait(0.5)
    except (ConnectionRefusedError, urllib.error.URLError,
            ConnectionResetError, TypeError, AttributeError):
        return 403

    comment_list = _find_comment_list(driver)
    _record_comments_count(driver, hid)

    for comment in comment_list.find_elements_by_class_name('comment_block'):
        try:
            name = comment.find_element_by_class_name(
                'name').find_element_by_tag_name('span').text
            cid = comment.get_attribute('data-cid')
            points = comment.find_element_by_class_name('n').text
            room_type = comment.find_element_by_class_name('room_link').text
            content = comment.find_element_by_class_name(
                'J_commentDetail').text.strip()
        except Exception:
            # Blocks missing any of the expected fields are skipped.
            continue
        logging.info('%s\n%s\n%s\n%s\n%s\n%s\n',
                     hid, name, n, room_type, points, content)
        if not Comment.objects.filter(cid=cid).exists():
            Comment.objects.create(cid=cid, content=content, hotel=hid,
                                   page=n, points=points,
                                   room_type=room_type, name=name)
        elif Comment.objects.filter(cid=cid).exclude(page=n).exists():
            # The comment is already stored under another page number.
            return 1
    return 0