Example #1
import logging

import lxml.html
from ghost import Ghost, TimeoutError


class WxGhost(object):
    def __init__(self):
        self.ghost = Ghost(log_level=logging.CRITICAL).start()
        self.ghost.download_images = False
        try:
            self.ghost.load_cookies("cookie.txt")
            print('load cookie')
        except IOError:
            print('load cookie error')
        self.ghost.show()

    def handle_frequency(self):
        # "您的访问过于频繁" = "your requests are too frequent" (rate-limit page).
        if u"您的访问过于频繁" in self.ghost.content:
            print('frequency')
            self.ghost.show()
            self.ghost.capture_to("seccode.png", selector="#seccodeImage")
            # Wait up to 30 minutes for the user to enter the captcha;
            # "以下内容来自微信公众号" ("the following content comes from a
            # WeChat official account") reappears once the block is lifted.
            self.ghost.wait_for_text(u'以下内容来自微信公众号', timeout=1800)
            self.ghost.save_cookies("cookie.txt")

    def open(self, url):
        try:
            self.ghost.open(url)
            self.handle_frequency()
        except TimeoutError:
            print('timeout when open')
            return False
        return True

    def evaluate(self, js, expect_loading=True):
        try:
            self.ghost.evaluate(js, expect_loading=expect_loading)
            self.handle_frequency()
        except TimeoutError:
            return False
        return True

    def sleep(self, value):
        self.ghost.sleep(value)

    def get_lxml(self):
        return lxml.html.fromstring(self.ghost.content)
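
For orientation, a minimal usage sketch of WxGhost; the Sogou WeChat search URL and the XPath query are hypothetical placeholders, not taken from the original code:

# Hypothetical driver; URL and XPath are illustrative only.
wx = WxGhost()
if wx.open('http://weixin.sogou.com/'):
    tree = wx.get_lxml()                 # lxml tree of the rendered page
    print(tree.xpath('//title/text()'))  # e.g. inspect the page title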
Example #2
import urlparse  # Python 2; use urllib.parse in Python 3

from bs4 import BeautifulSoup
from bs4.element import Tag
from ghost import Ghost, TimeoutError


class Crawler:
    def __init__(self,
                 location,
                 cookie_file=None,
                 mainwindow=None):
        self.mainwindow = mainwindow
        self.ghost = Ghost().start()
        self.ghost._confirm_expected = True
        # page_timeout, slash() and Test come from the surrounding project.
        self.ghost.wait_timeout = page_timeout
        self.ghost.download_images = False
        if cookie_file:  # guards against both None and '' (the original only checked '')
            try:
                self.ghost.load_cookies(cookie_file)
            except IOError:
                self.display("cookie: IOError", '<font color=red>$</font>', 'url')
        self.max_depth = 0
        self.url_queue = []
        self.location = location.split('?')[0]  # drop any query string
        # dvwa_security(self.__ghost, 'low')

    def go(self):
        self.display("...crawling", "<b>$</b>", 'url')
        times = 0
        while True:
            try:
                self.ghost.open(self.location)
                # Resolve redirects: ask the browser for the final URL.
                current_url, resources = self.ghost.evaluate('window.location.href')
                self.location = str(current_url)
                r = urlparse.urlparse(self.location)
                self.host = r.netloc
                self.display(self.location, "<a href='$'>$</a>", 'url')
                self.url_queue.append(self.location)
                break
            except TimeoutError:
                times += 1
            if times == 5:
                self.display("TimeoutError", '<font color=red>$</font>', 'url')
                self.exit()
                return  # without this the loop would spin forever after exit()
        self.crawler_page(self.location, 0)  # url, depth
        # Test every URL collected during the crawl.
        for url in self.url_queue:
            t = Test(self.ghost, url, self.mainwindow)
            t.test()
        self.exit()

    def crawler_page(self, location, depth):
        if depth >= self.max_depth:
            return
        try:
            self.ghost.open(location)
            # Resolve redirects: ask the browser for the final URL.
            current_url, resources = self.ghost.evaluate('window.location.href')
            location = str(current_url)
        except TimeoutError:
            return
        urls = []
        # ghost.content is already text, so hand it straight to bs4 with an
        # explicit parser (the original's str() could raise UnicodeEncodeError).
        soup = BeautifulSoup(self.ghost.content, 'html.parser')
        for a in soup.find_all('a'):
            url = self.convert_a(location, a)
            if url:
                host = urlparse.urlparse(url).netloc
                if host == self.host and url not in self.url_queue:
                    self.display(url, "<a href='$'>$</a>", 'url')
                    self.url_queue.append(url)
                    urls.append(url)
        for url in urls:
            self.crawler_page(url, depth + 1)

    def display(self, content, format=None, widget=None):
        print(content)
        if self.mainwindow:
            self.mainwindow.display(content, format, widget)

    def convert_a(self, location, a):
        if isinstance(a, Tag):
            try:
                href = a['href']
            except KeyError:
                return None
        elif isinstance(a, str):
            href = a
        else:
            return None  # anything else (e.g. a Python 2 unicode object) is ignored
        href = href.strip()
        # Links that lead nowhere.
        if href.lower() in ['javascript:;', 'javascript:void(0)', 'javascript:void(0);',
                            'return false;', '/', 'http://www', '']:
            return None
        for s in ['mailto:', '#', 'javascript:']:
            if href.lower().startswith(s):
                return None
        # Already absolute.
        if href.startswith('http://') or href.startswith('https://'):
            return href
        if href.startswith('//'):
            # Protocol-relative, e.g. //www.baidu.com/s; assume http,
            # as the original does.
            href = 'http:' + href
        elif href.startswith('/'):
            # Root-relative; self.host is a bare netloc, so rebuild a full URL
            # (the original's self.host + href[1:] dropped scheme and slash).
            href = 'http://' + self.host + href
        else:
            # Document-relative: append to the current directory.
            href = slash(location) + href
        return href

    def exit(self):
        self.ghost.hide()
        if self.mainwindow:
            self.mainwindow.go_button.setEnabled(True)
            self.mainwindow.finish()
        else:
            print("Finish")
        self.ghost.sleep(120)
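
A minimal usage sketch, assuming the surrounding project supplies page_timeout, slash() and Test; the target URL and depth are hypothetical placeholders:

# Hypothetical driver; URL and depth are illustrative only.
crawler = Crawler('http://127.0.0.1/dvwa/index.php', cookie_file='cookie.txt')
crawler.max_depth = 2  # the default of 0 makes crawler_page() return immediately
crawler.go()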