def get_words(self, url):
    """Fetch ``url``, extract and analyze its words, and store the result.

    The page content is fetched with ``GetWords.get_content`` before the
    ``try`` block, so an exception raised by the fetch propagates to the
    caller. Exceptions raised while extracting, analyzing, or writing are
    logged through ``logger.error`` (URL followed by the error text) and
    are not re-raised.

    Args:
        url: Page address; passed to ``GetWords.get_content`` and used as
            the first argument of ``PyMongoUtil.write``.

    Returns:
        None.
    """
    html = GetWords.get_content(url)
    try:
        analyzed = []
        for word in self.catch_words(html):
            # extend() flattens whatever iterable analyze() returns.
            analyzed.extend(self.analyze(word))
        PyMongoUtil.write(url, analyzed)
    except Exception as e:  # ``as`` form parses on Python 2.6+ and 3.
        # Broad catch kept from the original: log and swallow the error.
        logger.error(url + " " + str(e))
def testGetUrl():
    """Manual smoke test: scan one fixed page and store every result.

    Steps, in order: call ``PyMongoUtil.clean()`` and
    ``MemcacheUtil.clean()``, construct a ``SpiderBloomFilter`` (the
    instance is discarded), fetch the seed page, run ``UrlScan.scanpage``
    on it, write each returned item with ``PyMongoUtil.write(item, [""])``,
    and print how many items were returned.

    Returns:
        None. Nothing is asserted; the printed count is the only output.
    """
    seed_url = "http://www.leakedin.com/tag/emailpassword-dump/"

    PyMongoUtil.clean()
    MemcacheUtil.clean()
    # NOTE(review): result unused — presumably constructed for its side
    # effects (e.g. initializing shared filter state); confirm.
    SpiderBloomFilter()

    html = GetWords.get_content(seed_url)
    # Avoid shadowing the builtin ``list`` with the scan result.
    found_urls = UrlScan.scanpage(html, seed_url, None)
    for found in found_urls:
        PyMongoUtil.write(found, [""])
    # Parenthesized form prints the same on Python 2 and parses on Python 3.
    print(len(found_urls))
def testGetUrl():
    """Manual smoke test: scan a fixed page and persist the scan results.

    Calls ``PyMongoUtil.clean()`` and ``MemcacheUtil.clean()``, constructs
    a ``SpiderBloomFilter`` without keeping it, fetches the page at
    ``page_url`` via ``GetWords.get_content``, passes the content to
    ``UrlScan.scanpage``, writes every returned entry with
    ``PyMongoUtil.write(entry, [""])``, and prints the number of entries.

    Returns:
        None. The function makes no assertions.
    """
    # NOTE(review): a function with this same name is defined immediately
    # before this one; if both live in the same scope, this definition
    # rebinds the name — confirm which copy should remain.
    page_url = "http://www.leakedin.com/tag/emailpassword-dump/"

    PyMongoUtil.clean()
    MemcacheUtil.clean()
    # NOTE(review): instance is discarded; presumably created only for
    # side effects — verify.
    SpiderBloomFilter()

    page_html = GetWords.get_content(page_url)
    # Local renamed so the builtin ``list`` is not shadowed.
    scanned = UrlScan.scanpage(page_html, page_url, None)
    for entry in scanned:
        PyMongoUtil.write(entry, [""])
    # Single-argument print() is output-identical on Python 2 and valid on 3.
    print(len(scanned))