def testGetUrl():
    """Scan one hard-coded page and store every item the scan returns.

    Steps, in order:
      1. Clear old data (``PyMongoUtil.clean()``, ``MemcacheUtil.clean()``)
         and create the bloom filter (``SpiderBloomFilter()``).
      2. Get the page content with ``GetWords.get_content``.
      3. Hand the content and the page URL to ``UrlScan.scanpage``.
      4. Write each returned item with ``PyMongoUtil.write`` and print how
         many items were returned.

    Takes no arguments and returns ``None``.
    """
    # NOTE(review): SOURCE contains a second definition of this same name
    # right after this one; at import time the later definition wins.
    page_url = "http://www.leakedin.com/tag/emailpassword-dump/"

    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()

    html = GetWords.get_content(page_url)
    # The third argument is passed as None exactly as before; its meaning
    # is defined by UrlScan.scanpage, which is not visible here.
    found = UrlScan.scanpage(html, page_url, None)

    for item in found:
        # The second argument ([""]) is kept unchanged from the original call.
        PyMongoUtil.write(item, [""])

    # Call-form print: same output as the statement form for a single
    # argument on Python 2, and it also parses on Python 3.
    print(len(found))
def testGetUrl():
    """Run a single-page scan against a fixed URL and persist the results.

    The function resets state first (``PyMongoUtil.clean()``,
    ``MemcacheUtil.clean()``, then a fresh ``SpiderBloomFilter()``), reads
    the page content through ``GetWords.get_content``, and passes that
    content plus the URL to ``UrlScan.scanpage``. Every element of the
    returned collection is written with ``PyMongoUtil.write``; finally the
    number of elements is printed.

    Takes no arguments and returns ``None``.
    """
    # NOTE(review): an earlier definition of this same name appears in
    # SOURCE; this later one is the definition that takes effect.
    target = "http://www.leakedin.com/tag/emailpassword-dump/"

    # Reset stored data and create the bloom filter before scanning.
    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()

    content = GetWords.get_content(target)
    # `None` is forwarded as the third scanpage argument, unchanged.
    scan_results = UrlScan.scanpage(content, target, None)

    for entry in scan_results:
        PyMongoUtil.write(entry, [""])

    # print(...) with one argument behaves the same on Python 2 and is
    # valid syntax on Python 3, unlike the statement form.
    print(len(scan_results))
def trytry():
    """Run one regex-mode spider pass over a fixed URL, then count words.

    Steps, in order:
      1. Clear old data (``PyMongoUtil.clean()``, ``MemcacheUtil.clean()``)
         and create the bloom filter (``SpiderBloomFilter()``).
      2. Obtain a queue and a lock from ``PyPool`` and create a
         ``MyListener``.
      3. Build a ``Regex`` mode and a ``SpiderStrategy`` for the start URL,
         then call ``Spider(strategy).get_all_words(queue, lock)``.
      4. Call ``listener.listen(lock, queue)`` and finally
         ``WordCount.calc_count()``.

    Takes no arguments and returns ``None``.
    """
    start_url = "http://www.leakedin.com/tag/emailpassword-dump/"

    # Clean old data and create the bloom filter.
    PyMongoUtil.clean()
    MemcacheUtil.clean()
    SpiderBloomFilter()

    # Shared queue/lock handed to both the spider and the listener.
    queue = PyPool.get_queue()
    lock = PyPool.get_lock()
    listener = MyListener()

    # Raw literal: the string value is unchanged because \-, \. and \| are
    # not string escapes, but the pattern no longer relies on unrecognised
    # escape sequences being passed through by the parser.
    # Pattern: an email-like token, optionally followed by ':', ',' or '|'
    # characters and any trailing text.
    email_mode = Regex(r"[a-z0-9\-\._]+@[a-z0-9\-\.]+\.[a-z]{2,4}[:,\|]*.*")

    # NOTE(review): the positional 2 is presumably a crawl depth -- confirm
    # against SpiderStrategy's signature.
    strategy = SpiderStrategy(
        start_url,
        2,
        is_out=False,
        pattern=None,
        mode=email_mode,
    )

    Spider(strategy).get_all_words(queue, lock)
    # Note the argument order differs from get_all_words: lock first here.
    listener.listen(lock, queue)
    WordCount.calc_count()
from SpiderUtils.bloomFilter import SpiderBloomFilter from SpiderUtils.spider import Spider from SpiderUtils.spiderStrategy import SpiderStrategy from Statics.wordCount import WordCount from Utils.logFactory import LogFactory from SpiderUtils.SpiderMode.regexMode import Regex from SpiderUtils.enums import Language from SpiderUtils.getWords import GetWords from PyIO.excelUtil import ExcelUtil from os import path logger = LogFactory.getlogger("main") # clean old data PyMongoUtil.clean() MemcacheUtil.clean() # create bloom filter SpiderBloomFilter() # multitask prepare queue = PyPool.get_queue() lock = PyPool.get_lock() listener = MyListener() def err(): print("please enter the right select") while True: