def get_words(self, url):
    """Fetch ``url``, extract its words and store the analysed result.

    The page is downloaded with ``GetWords.get_content``. Every item
    returned by ``self.catch_words`` is passed through ``self.analyze`` and
    the results are concatenated into one list, which is handed to
    ``PyMongoUtil.write`` together with the URL.

    Exceptions raised while extracting, analysing or writing are logged
    with the URL and swallowed (best effort). The download happens before
    the ``try`` block, so a failure there propagates to the caller.

    :param url: address of the page to process.
    """
    html = GetWords.get_content(url)
    try:
        wlist = []
        for wd in self.catch_words(html):
            # analyze() yields an iterable per word; flatten into one list.
            wlist.extend(self.analyze(wd))
        PyMongoUtil.write(url, wlist)
    except Exception as e:  # "as" form: valid on Python 2.6+ and Python 3
        logger.error(url + " " + str(e))
def testGetUrl():
    """Manual check of ``UrlScan.scanpage`` against one hard-coded page.

    Calls ``PyMongoUtil.clean()`` and ``MemcacheUtil.clean()``, instantiates
    ``SpiderBloomFilter`` (result discarded -- presumably for its
    construction side effects; confirm), downloads the page, scans it, writes
    each scanned item through ``PyMongoUtil.write`` with ``[""]`` as the
    payload, and finally prints how many items the scan returned.
    """
    page_url = "http://www.leakedin.com/tag/emailpassword-dump/"

    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()

    html = GetWords.get_content(page_url)
    # The same URL is passed as the second argument to scanpage.
    found_urls = UrlScan.scanpage(html, page_url, None)
    for found in found_urls:
        PyMongoUtil.write(found, [""])
    print(len(found_urls))
def testGetUrl():
    """Run a one-page scan and print the number of results.

    Steps, in order: ``PyMongoUtil.clean()``, ``MemcacheUtil.clean()``,
    ``SpiderBloomFilter()`` (return value unused), fetch the hard-coded page
    via ``GetWords.get_content``, scan it with ``UrlScan.scanpage``, write
    every scanned entry with ``PyMongoUtil.write(entry, [""])``, then print
    the count of entries.
    """
    target = "http://www.leakedin.com/tag/emailpassword-dump/"

    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()

    html = GetWords.get_content(target)
    scanned = UrlScan.scanpage(html, target, None)
    for entry in scanned:
        PyMongoUtil.write(entry, [""])
    # Parenthesised form prints the same on Python 2 and Python 3.
    print(len(scanned))
def catch_words(self, html):
    """Return whatever ``GetWords.get_chinese`` yields for ``html``."""
    return GetWords.get_chinese(html)
queue = PyPool.get_queue()
lock = PyPool.get_lock()
listener = MyListener()


def err():
    """Print the invalid-selection message."""
    print("please enter the right select")


# Prompt until the entered URL can be reached with status code 200.
while True:
    url = raw_input("Please input url:\n")
    print("checking url...")
    # Inputs that do not start with "http" get an "http://" prefix.
    if not url.startswith("http"):
        url = "http://" + url
    try:
        statusCode = GetWords.try_connect(url)
    except Exception as e:  # "as" form: valid on Python 2.6+ and Python 3
        print(str(e))
        continue
    if statusCode == 200:
        break
    print("cannot connect to the website")

# Prompt until the depth consists only of digits; it is left as a string.
while True:
    depth = raw_input("Please input depth:\n")
    if depth.isdigit():
        break
    print("please enter a number\n")
def catch_words(self, html):
    """Return ``GetWords.get_by_regex`` applied to ``html`` with this
    instance's private ``__pattern`` attribute."""
    return GetWords.get_by_regex(html, self.__pattern)
def catch_words(self, html):
    """Pass ``html`` through ``HtmlUtil.filter_tags`` and return the result
    of ``GetWords.get_english`` on the filtered text."""
    filtered = HtmlUtil.filter_tags(html)
    return GetWords.get_english(filtered)
def catch_words(self, html):
    """Return whatever ``GetWords.get_korean`` yields for ``html``."""
    return GetWords.get_korean(html)