Example #1
0
def get_hotel(driver: WebDriver, city: str, n: int) -> None:
    """Scrape one page of a city's hotel listing and store unseen hotels.

    Loads ``ROOT_URL/<city>/p<n>``, walks every ``searchresult_list`` entry
    inside the ``hotel_list`` element and creates a ``Hotel`` row for each
    hotel id that is not stored yet.

    Entries are skipped (not stored) when:
      * their ``id`` attribute is not purely numeric,
      * they have no ``hotel_value`` element, or
      * ``RE_COMMENT`` finds nothing in the ``hotel_judgement`` text.

    :param driver: Selenium driver used for navigation and lookups.
    :param city: city path segment used in the listing URL.
    :param n: page number of the listing.
    """
    driver.get('%s/%s/p%d' % (ROOT_URL, city, n))

    # The implicit wait is a session-wide timeout for the element lookups
    # below; it does not pause execution by itself.
    driver.implicitly_wait(1)

    hotel_list = driver.find_element_by_id('hotel_list')
    hotels = hotel_list.find_elements_by_class_name('searchresult_list')
    for hotel in hotels:
        hid = str(hotel.get_attribute('id'))
        if not re.match(r'^\d+$', hid):
            continue
        name = driver.find_element_by_xpath('//*[@id="%s"]/ul/li[2]/h2/a' %
                                            hid).get_attribute('title')
        try:
            points = hotel.find_element_by_class_name('hotel_value').text
        except Exception:
            # Best effort: an entry without a score element is skipped.
            continue
        start_price = hotel.find_element_by_class_name('J_price_lowList').text
        about_points = hotel.find_element_by_class_name('hotel_judgement').text

        # search() returns None when the text does not match; calling
        # .group() on it used to abort the whole page with AttributeError.
        count_match = RE_COMMENT.search(about_points)
        if count_match is None:
            logging.warning('hotel %s: no review count found in %r, skipped',
                            hid, about_points)
            continue
        points_count = count_match.group()

        logging.info('%s\n%s\n%s\n%s\n%s\n%s\n%s\n',
                     city, hid, name, n, points, start_price, points_count)
        if not Hotel.objects.filter(hid=hid).exists():
            Hotel.objects.create(city=city,
                                 hid=hid,
                                 name=name,
                                 page=n,
                                 points=points,
                                 start_price=start_price,
                                 points_count=points_count)
Example #2
0
def get_driver():
    """Create the browser driver used by the UI tests.

    The driver class is PhantomJS when ``settings.OPEN511_UI_TEST_BROWSER``
    equals ``'phantomjs'`` and Firefox otherwise. The returned driver has a
    5 second implicit wait and a 1000x700 window.
    """
    wants_phantomjs = settings.OPEN511_UI_TEST_BROWSER == 'phantomjs'
    if not wants_phantomjs:
        from selenium.webdriver.firefox.webdriver import WebDriver
    else:
        from selenium.webdriver.phantomjs.webdriver import WebDriver

    browser = WebDriver()
    browser.implicitly_wait(5)
    width, height = 1000, 700
    browser.set_window_size(width, height)
    return browser
Example #3
0
def create_webdriver():
    """Return a new WebDriver with a 60 second implicit wait and a 1400x1000 window."""
    browser = WebDriver()
    browser.implicitly_wait(60)
    width, height = 1400, 1000
    browser.set_window_size(width, height)
    return browser
Example #4
0
def create_webdriver():
    """Build a WebDriver configured with a 60 second implicit wait and a 1400x1000 window."""
    session = WebDriver()
    session.implicitly_wait(60)
    window_size = (1400, 1000)
    session.set_window_size(*window_size)
    return session
Example #5
0
class HtmlURLUtil:
    """
        HTML request helper.

        urllib: core Python library; used here only for URL handling, not
            for issuing requests.
        tld (top level domain): convenient URL/domain parsing helper.
        selenium: web automation tool used to drive the browser.
        PhantomJS: headless WebKit used to perform the actual request, which
            allows AJAX-rendered pages (e.g. search engine results) to be
            fetched.
    """
    __USER_AGENT = "Mozilla/5.0 (Macintosh; Intel Mac OS X 10_12_6) " \
                   "AppleWebKit/537.36 (KHTML, like Gecko) Chrome/63.0.3239.84 Safari/537.36"

    def __init__(self, driver=None):
        """Store an optional existing driver and the default request headers.

        :param driver: an already created driver, or None. Note that
            getHtml() replaces it with a driver of its own.
        """
        self.driver = driver
        self.headers = {
            'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,image/apng,*/*;q=0.8',
            'Accept-Language': 'zh-CN,zh;q=0.9,en;q=0.8',
            # 'Accept-Encoding': '*',
            'Cache-Control': 'max-age=0',
            'User-Agent': HtmlURLUtil.__USER_AGENT,
            'Connection': 'keep-alive',
            'Referer': 'https://www.baidu.com/'
        }

    def getHtml(self, url, referer="https://www.baidu.com/"):
        """Load ``url`` in a freshly created driver and return the page source.

        On success the driver stays open in ``self.driver`` (so that
        getElementsByTagName() can be used) and must be released with
        closeWebDriver(). On any error the exception is logged, the driver is
        quit if there is one, and an empty string is returned.

        NOTE(review): every call creates a new driver and overwrites
        ``self.driver`` without quitting the previous one; callers that call
        this repeatedly should call closeWebDriver() in between.

        :param url: address to load.
        :param referer: value sent as the Referer header.
        :return: page source, or "" on failure.
        """
        _result = ""
        try:
            my_dc = DesiredCapabilities.PHANTOMJS.copy()
            my_dc["browserName"] = "chrome"
            my_dc["platform"] = "mac"
            my_dc["version"] = "63.0.3239.84"
            my_dc["phantomjs.page.settings.loadImages"] = False
            my_dc["phantomjs.page.settings.userAgent"] = HtmlURLUtil.__USER_AGENT

            service_args = ["--load-images=false", "--disk-cache=false",
                            "--ignore-ssl-errors=true"]
            # "--webdriver-logfile=webdriver.log","--webdriver-loglevel=INFO"
            # items() behaves the same as iteritems() here and also exists
            # on Python 3 dictionaries.
            for head, value in self.headers.items():
                my_dc["phantomjs.page.customHeaders.{}".format(head)] = value

            # The explicit argument wins over the default Referer header.
            my_dc["phantomjs.page.customHeaders.Referer"] = referer
            self.driver = WebDriver(desired_capabilities=my_dc, service_args=service_args)
            self.driver.set_script_timeout(20)
            self.driver.set_page_load_timeout(30)
            self.driver.implicitly_wait(5)
            self.driver.set_window_size(2560, 1066)

            self.driver.get(url)
            # Save a screenshot of the page:
            # self.driver.save_screenshot(md5_util.md5(url)+".png")
            _result = self.driver.page_source
        except Exception:
            log.getLogger().exception("HtmlURLUtil  getHtml error...")
            # The driver may never have been created (failure before or
            # inside WebDriver(...)); closeWebDriver() tolerates that, where
            # an unconditional self.driver.quit() raised AttributeError.
            self.closeWebDriver()
        return _result

    def closeWebDriver(self):
        """Quit the current driver; does nothing when no driver is set."""
        if self.driver is not None:
            self.driver.quit()

    def getSortQS(self, url):
        """
        Get the sorted query string of a URL.

        :param url: URL to inspect.
        :return: the parsed query parameters passed through
            sort_util.fastSortDict, or None when the URL has no query string.
        """
        a = urllib.splitquery(url)
        if len(a) <= 1 or not a[1]:
            return None
        qs = urlparse.parse_qs(a[1])
        # Original author's note: uses a quicksort, O(n log n).
        return sort_util.fastSortDict(qs)

    def getTLD(self, url):
        """
        Get the domain object for a URL.

        :param url: URL to inspect.
        :return: result of tld.get_tld() for the URL without its query
            string, or None for an empty URL or when the lookup fails
            (the failure is logged).
        """
        try:
            if not url:
                return None
            web = urllib.splitquery(url)[0]
            return tld.get_tld(web)
        except Exception:
            log.getLogger().exception("getTLD ...%s" % url)
        return None

    def getMd5URL(self, url):
        """
        MD5 a URL in a way that ignores query parameter order.

        The query parameters are sorted first, then the URL without its
        query string plus the string form of the sorted parameters is hashed.

        :param url: URL to hash.
        :return: result of md5_util.md5() for the normalised string.
        """
        web = urllib.splitquery(url)[0]
        string = web + str(self.getSortQS(url))
        return md5_util.md5(string)

    def getElementsByTagName(self, elname):
        """Return all elements with tag name ``elname`` from the current driver."""
        return self.driver.find_elements_by_tag_name(elname)

    def writeWebContentToFile(self, webcontent, filepath):
        """Write ``webcontent`` to ``filepath``, creating parent directories.

        Empty content is ignored. Failures are logged, not raised.

        :param webcontent: text to write.
        :param filepath: destination file path.
        """
        if not webcontent:
            return
        # NOTE(review): Python 2 only hack that changes the process-wide
        # default encoding; kept so non-ASCII content is written as before.
        reload(sys)
        sys.setdefaultencoding("utf-8")
        try:
            _dir = os.path.dirname(filepath)
            # dirname is "" for a bare file name; makedirs("") would fail.
            if _dir and not os.path.exists(_dir):
                os.makedirs(_dir)
            # "with" closes the file only if it was actually opened; the old
            # finally: f.close() hit an unbound name when open() failed.
            with open(filepath, "w") as f:
                f.write(webcontent)
        except Exception:
            log.getLogger().exception("htmlutil writeWebContentToFile ...")

    def getCharset(self, content):
        """Return the lower-cased charset declared in a ``<meta>`` tag.

        :param content: HTML text to search.
        :return: the declared charset, or "utf-8" when none is found.
        """
        charset = "utf-8"
        m = re.compile('<meta .*(http-equiv="?Content-Type"?.*)?charset="?([a-zA-Z0-9_-]+)"?', re.I)\
            .search(content)
        if m and m.lastindex == 2:
            charset = m.group(2).lower()
        return charset
Example #6
0
def create_webdriver(
        phantomjs_path='/home/travis/build/mapbender/mapbender-starter/application/bin/phantomjs',
        window_size=(1400, 1000),
        implicit_wait=300):
    """Create a WebDriver from a PhantomJS executable and configure it.

    All parameters default to the values that used to be hard-coded, so
    calling this without arguments behaves as before; passing them allows
    running outside the CI environment the default path points to.

    :param phantomjs_path: path handed to ``WebDriver`` as its first argument.
    :param window_size: ``(width, height)`` of the browser window.
    :param implicit_wait: implicit wait timeout in seconds.
    :return: the configured driver.
    """
    wd = WebDriver(phantomjs_path)

    width, height = window_size
    wd.set_window_size(width, height)
    wd.implicitly_wait(implicit_wait)
    return wd
Example #7
0
def spider_comments(driver: WebDriver, hid: str, n: int) -> int:
    """Scrape page ``n`` of the reviews of hotel ``hid`` and store new ones.

    :param driver: Selenium driver used for navigation and lookups.
    :param hid: hotel id used in the review URL and stored on each comment.
    :param n: review page number.
    :return:
        * ``0`` when the page was processed, or when 15 comments are already
          stored for this hotel and page (presumably a full page — confirm),
        * ``1`` as soon as a scraped comment is found that is already stored
          under a different page number,
        * ``403`` when loading the page raised one of the connection-type
          errors handled below.
    """
    if Comment.objects.filter(hotel=hid).filter(page=n).count() == 15:
        return 0

    try:
        driver.get('%s/dianping/%s_p%dt0.html' % (ROOT_URL, hid, n))
        driver.implicitly_wait(0.5)
    except (ConnectionRefusedError, urllib.error.URLError,
            ConnectionResetError, TypeError, AttributeError):
        return 403

    list_selector = '#divCtripComment > div.comment_detail_list'
    try:
        # The second matching list is the one that gets scraped.
        comment_list = driver.find_elements_by_css_selector(list_selector)[1]
    except IndexError:
        # Retry once with a longer implicit wait, then fall back to the
        # first (only) matching list.
        driver.implicitly_wait(5)
        try:
            comment_list = driver.find_elements_by_css_selector(
                list_selector)[1]
        except IndexError:
            comment_list = driver.find_element_by_css_selector(list_selector)

    if Hotel.objects.filter(hid=hid).count() == 1:
        hotel = Hotel.objects.get(hid=hid)
        if hotel.comments_count == 0:
            try:
                comment_text = driver.find_element_by_css_selector(
                    "#commentTab > a").text

                logging.warning("\n%s\n", comment_text)
                hotel.comments_count = int(
                    RE_COMMENT.search(comment_text).group())
                logging.warning("\n%s\n", hotel.comments_count)
                hotel.save()
            except Exception:
                # Still best effort, but no longer silent: a missing tab, an
                # unparsable count or a failed save is now visible in the log.
                logging.warning('could not update comments_count of hotel %s',
                                hid, exc_info=True)

    comments = comment_list.find_elements_by_class_name('comment_block')

    for comment in comments:
        try:
            name = comment.find_element_by_class_name(
                'name').find_element_by_tag_name('span').text
            cid = comment.get_attribute('data-cid')
            points = comment.find_element_by_class_name('n').text
            room_type = comment.find_element_by_class_name('room_link').text
            content = comment.find_element_by_class_name(
                'J_commentDetail').text.strip()
        except Exception:
            # Best effort: a block missing any of these parts is skipped.
            continue
        logging.info('%s\n%s\n%s\n%s\n%s\n%s\n',
                     hid, name, n, room_type, points, content)

        if not Comment.objects.filter(cid=cid).exists():
            Comment.objects.create(cid=cid,
                                   content=content,
                                   hotel=hid,
                                   page=n,
                                   points=points,
                                   room_type=room_type,
                                   name=name)

        elif Comment.objects.filter(cid=cid).exclude(page=n).exists():
            # This comment is already stored under another page number.
            return 1

    return 0