def Principal():
    # Pre-allocate a pool of 10 threads.
    thread = ThreadPool(10)
    while True:
        cnx, end = s.accept()
        print "the following address connected: " + end[0]
        clientes.append(cnx)
        thread.insert_job(novo_cliente, cnx, end)
def main():
    try:
        f = open(r'ip.txt', 'rb')
        ip = ''
        for line in f.readlines():
            final_ip = line.strip('\n')
            for i in get_ip_list(final_ip):
                print i
                ip += str(i).strip() + '\n'
        with open(r'scan_ip.txt', 'w') as ff:
            ff.write(ip)
        data = []
        items = portscan()  # Run masscan over the ports
        dataList = {}
        for i in items:
            i = i.split('|')
            if i[1] not in dataList:
                dataList[str(i[1])] = []
            dataList[str(i[1])].append(i[0])
        for i in dataList:
            if len(dataList[i]) >= 50:
                # Drop hosts that report 50 or more open ports
                for port in dataList[i]:
                    items.remove(str(port) + '|' + str(i))
        pool = ThreadPool(20, 1000)
        pool.start(NmapScan, items, data)
    except Exception as e:
        print e
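# The get_ip_list() helper called by main() above is defined elsewhere.
# A minimal sketch under the assumption that each line of ip.txt is a
# single address or a CIDR block, expanded here with the IPy library
# (the dependency and exact semantics are assumptions, not confirmed by
# the original source):
from IPy import IP

def get_ip_list(ip_segment):
    # IP() accepts both single addresses and CIDR notation, e.g.
    # '1.2.3.4' or '10.0.0.0/24', and iterating yields every address.
    return [str(ip) for ip in IP(ip_segment)]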
def __init__(self, start_url, thread_num, post_list_path, max_post_num=1000):
    """
    `group_id`        the group id to crawl
    `thread_num`      number of crawler threads
    `post_list_path`  file path for saving the complete post id list
    """
    # Thread pool with the given number of threads
    self.thread_pool = ThreadPool(thread_num)
    # Thread for saving topics
    # NOTE: only one saver thread is allowed here, since they all operate
    # on the same file
    self.save_thread = ThreadPool(1)
    # Group-related information
    self.post_list_path = post_list_path
    # Pages already visited: Group id ==> True or False
    self.visited_href = set()
    # Group discussion pages waiting to be visited
    self.unvisited_href = deque()
    # Links that failed to load
    self.failed_href = set()
    self.start_url = start_url
    # Crawling ends in one of two cases: 1) the number of topics crawled
    # has reached the maximum; 2) all topics have been crawled
    # Only thread ids are saved
    self.post_list = list()
    self.is_crawling = False
    # Maximum number of topics to crawl per group
    self.MAX_POST_NUM = max_post_num
def __init__(self, args=Strategy()):
    self.url = args.url
    self.max_depth = args.max_depth      # Maximum page depth
    self.max_count = args.max_count      # Maximum number of pages to crawl
    self.concurrency = args.concurrency  # Number of threads
    self.timeout = args.timeout          # Timeout
    self.cookies = args.cookies          # Cookies
    self.ssl_verify = args.ssl_verify    # SSL verification
    self.same_host = args.same_host      # Only crawl links on the same host
    self.same_domain = args.same_domain  # Only crawl links in the same domain
    self.currentDepth = 1                # Initial crawl depth, starting from 1
    self.keyword = args.keyword          # Keyword, decoded with the console's default encoding
    self.threadPool = ThreadPool(args.concurrency)  # Thread pool with the given number of threads
    self.visitedHrefs = set()            # Links already visited
    self.unvisitedHrefs = deque()        # Links waiting to be visited
    self.unvisitedHrefs.append(args.url) # Seed with the first link to visit
    self.isCrawling = False              # Flag marking whether the crawler is running
    self.file = BASEDIR + '/cache/crawler/' + genFilename(self.url) + '.txt'
    # FIXME: the following line is problematic
    self.database = Database(args.dbFile)  # Database
    self.lock = Lock()
class FileService():
    def __init__(self, ip, port, node_list, directory, timeout):
        self.check_file_service = CheckFileService(
            ip=ip,
            port=3001,
            nodes=node_list,
            timeout=timeout,
            directory=directory,
        )
        self.file_transfer_service = FileTransferService(ip=ip,
                                                         port=3002,
                                                         directory=directory)
        self.directory = directory
        self.threadpool = ThreadPool(2)

    def start_service(self):
        self.threadpool.add_task(self.check_file_service.start_server, ('', 0))
        self.threadpool.wait_completion()

    def get_file(self, file_path):
        # Find the fastest node holding the file, fetch it over TCP, and
        # save it locally.
        fastest_node = self.check_file_service.process_get_file_request(
            file_path=file_path)
        requested_file = self.file_transfer_service.get_file_from_node(
            node_ip=fastest_node["node"],
            node_port=fastest_node["node_port"],
            file_path=file_path)
        self.save_file(file_path, requested_file)

    def save_file(self, file_path, file_to_be_saved):
        # Open for writing so the fetched content is actually persisted.
        with open(file_path, 'w') as f:
            f.write(file_to_be_saved)
def saveProxies(self):
    # Create 30 threads and start them
    threadPool = ThreadPool(30)
    threadPool.startThreads()
    # Read the proxy records from the database
    proxyip = self.proxyip_db.readData()
    # Queue a check task for every proxy
    for proxy in proxyip:
        threadPool.putTask(self.checkclientUrl, proxy[0])
    # Collect the results in a loop: on success write to the database,
    # on failure set available to 0 or delete the record
    ip_fail = 0
    ip_ok = 0
    ip_lock = 0
    while threadPool.getTaskLeft():
        flag, proxy = threadPool.getTaskResult()
        print flag, proxy
        if flag == 'ok':
            self.proxyip_db.updateData(1, proxy)
            ip_ok = ip_ok + 1
        elif flag == 'lock':
            self.proxyip_db.updateData(0, proxy)
            ip_lock = ip_lock + 1
        else:
            self.proxyip_db.delData(proxy)
            ip_fail = ip_fail + 1
    print '====> available ip: ', ip_ok, ' , lock ip: ', ip_lock, ' , fail ip: ', ip_fail, ' <===='
    threadPool.stopThreads()
def __init__(self, pool, maxHostID, monitorInterval=2):
    self._messageTypes = {}
    # Save arguments
    self._stop = False
    self._stopped = False
    self._poolID = str(pool.spUUID)
    self._spmStorageDir = pool.storage_repository
    tpSize = config.getint('irs', 'thread_pool_size') / 2
    waitTimeout = 3
    maxTasks = config.getint('irs', 'max_tasks')
    self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
    # *** IMPORTANT NOTE: The SPM's inbox is the HSMs' outbox and vice
    # versa ***
    self._inbox = os.path.join(self._spmStorageDir, self._poolID,
                               "mastersd", sd.DOMAIN_META_DATA, "inbox")
    if not os.path.exists(self._inbox):
        self.log.error("SPM_MailMonitor create failed - inbox %s does not "
                       "exist" % repr(self._inbox))
        raise RuntimeError("SPM_MailMonitor create failed - inbox %s does "
                           "not exist" % repr(self._inbox))
    self._outbox = os.path.join(self._spmStorageDir, self._poolID,
                                "mastersd", sd.DOMAIN_META_DATA, "outbox")
    if not os.path.exists(self._outbox):
        self.log.error("SPM_MailMonitor create failed - outbox %s does "
                       "not exist" % repr(self._outbox))
        raise RuntimeError("SPM_MailMonitor create failed - outbox %s "
                           "does not exist" % repr(self._outbox))
    self._numHosts = int(maxHostID)
    self._outMailLen = MAILBOX_SIZE * self._numHosts
    self._monitorInterval = monitorInterval
    # TODO: add support for multiple paths (multiple mailboxes)
    self._outgoingMail = self._outMailLen * "\0"
    self._incomingMail = self._outgoingMail
    self._inCmd = ['dd',
                   'if=' + str(self._inbox),
                   'iflag=direct,fullblock',
                   'count=1'
                   ]
    self._outCmd = ['dd',
                    'of=' + str(self._outbox),
                    'oflag=direct',
                    'iflag=fullblock',
                    'conv=notrunc',
                    'count=1'
                    ]
    self._outLock = threading.Lock()
    self._inLock = threading.Lock()
    # Clear outgoing mail
    self.log.debug("SPM_MailMonitor - clearing outgoing mail, command is: "
                   "%s", self._outCmd)
    cmd = self._outCmd + ['bs=' + str(self._outMailLen)]
    (rc, out, err) = _mboxExecCmd(cmd, data=self._outgoingMail)
    if rc:
        self.log.warning("SPM_MailMonitor couldn't clear outgoing mail, "
                         "dd failed")
    t = concurrent.thread(self.run, name="mailbox.SPMMonitor",
                          logger=self.log.name)
    t.start()
    self.log.debug('SPM_MailMonitor created for pool %s' % self._poolID)
def __init__(self, ip, port, initial_nodes, period):
    super(DiscoveryService, self).__init__(name='Discovery', ip=ip,
                                           port=port)
    self.period = period
    self.nodes = initial_nodes
    self.threadpool = ThreadPool(2)
def __init__(self, inbox, outbox, hostID, queue, monitorInterval):
    # Save arguments
    tpSize = config.getint('irs', 'thread_pool_size') / 2
    waitTimeout = 3
    maxTasks = config.getint('irs', 'max_tasks')
    self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
    self._stop = False
    self._flush = False
    self._queue = queue
    self._activeMessages = {}
    self._monitorInterval = monitorInterval
    self._hostID = int(hostID)
    self._used_slots_array = [0] * MESSAGES_PER_MAILBOX
    self._outgoingMail = EMPTYMAILBOX
    self._incomingMail = EMPTYMAILBOX
    # TODO: add support for multiple paths (multiple mailboxes)
    self._spmStorageDir = config.get('irs', 'repository')
    self._inCmd = [constants.EXT_DD,
                   'if=' + str(inbox),
                   'iflag=direct,fullblock',
                   'bs=' + str(BLOCK_SIZE),
                   'count=' + str(BLOCKS_PER_MAILBOX),
                   'skip=' + str(self._hostID * BLOCKS_PER_MAILBOX)
                   ]
    self._outCmd = [constants.EXT_DD,
                    'of=' + str(outbox),
                    'iflag=fullblock',
                    'oflag=direct',
                    'conv=notrunc',
                    'bs=' + str(BLOCK_SIZE),
                    'seek=' + str(self._hostID * BLOCKS_PER_MAILBOX)
                    ]
    self._init = False
    self._initMailbox()  # Read initial mailbox state
    self._msgCounter = 0
    self._sendMail()  # Clear outgoing mailbox
    self._thread = concurrent.thread(self.run, name="mailbox/hsm",
                                     logger=self.log.name)
    self._thread.start()
def __init__(self,
             tpSize=config.getfloat('irs', 'thread_pool_size'),
             waitTimeout=3,
             maxTasks=config.getfloat('irs', 'max_tasks')):
    self.storage_repository = config.get('irs', 'repository')
    self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
    self._tasks = {}
    self._unqueuedTasks = []
def __init__(self, args):
    self.depth = args.depth
    self.currentDepth = 1
    self.database = database(args.dbFile)
    self.threadPool = ThreadPool(args.threadNum)
    self.visitUrls = set()
    self.unvisitedUrls = deque()
    self.unvisitedUrls.append(args.url)
    self.isCrawling = False
    self.maxWebPages = args.maxWebPages
def __init__(self, args):
    self.depth = args.depth
    self.currentDepth = 1
    self.keyword = args.keyword.decode(getdefaultlocale()[1])
    self.database = Database(args.dbFile)
    self.threadPool = ThreadPool(args.threadNum)
    self.visitedHrefs = set()
    self.unvisitedHrefs = deque()
    self.unvisitedHrefs.append(args.url)
    self.isCrawling = False
def __init__(self, ip, port, nodes, timeout, directory,
             file_transfer_service):
    super(CheckFileService, self).__init__(name='CheckFile', ip=ip,
                                           port=port)
    self.nodes = nodes
    self.directory = directory
    self.client_socket.settimeout(timeout)
    self.threadpool = ThreadPool(2)
    self.file_transfer_service = file_transfer_service
def clientThreadMain():
    # Pre-allocate a pool of 20 threads
    thread = ThreadPool(20)
    # Server main loop
    while True:
        conexao, endereco = server.accept()
        print endereco[0] + " connected!"
        # When a client connects, it is added to the client list
        # (used for broadcasting)
        clientes.append(conexao)
        thread.insert_job(newClient, conexao, endereco)
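# The newClient job queued above is not shown in this snippet. A purely
# hypothetical sketch, assuming each client is served by a simple
# recv/broadcast loop over the shared `clientes` list:
def newClient(conexao, endereco):
    while True:
        dados = conexao.recv(1024)
        if not dados:  # empty read means the client disconnected
            break
        # Relay the message to every other connected client.
        for cliente in clientes:
            if cliente is not conexao:
                cliente.send(dados)
    clientes.remove(conexao)
    conexao.close()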
def __init__(self, ip, port, node_list, directory, timeout):
    self.check_file_service = CheckFileService(
        ip=ip,
        port=3001,
        nodes=node_list,
        timeout=timeout,
        directory=directory,
    )
    self.file_transfer_service = FileTransferService(ip=ip,
                                                     port=3002,
                                                     directory=directory)
    self.directory = directory
    self.threadpool = ThreadPool(2)
def __init__(self, groupID, topicIDList, threadNum, topic_info_path,
             comment_info_path):
    """
    `groupID`           the current group id
    `topicIDList`       list of topic ids to crawl
    `threadNum`         number of threads to start
    `topic_info_path`   file for storing topic info
    `comment_info_path` file for storing comment info
    """
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(threadNum)
    # Thread for writing the database
    # self.DBThread = ThreadPool(1)
    # Ensure only one thread writes the file at a time
    self.saveThread = ThreadPool(1)
    self.database = Database("DoubanGroup.db")
    # self.database = Database("test.db")
    self.topic_info_path = topic_info_path
    self.comment_info_path = comment_info_path
    # Pages already visited: Group id ==> True or False
    self.visitedHref = set()
    # Topic ids that failed to be crawled
    self.failed = set()
    # Extract topic comments for each group in turn
    self.groupID = groupID
    self.topicIDList = topicIDList  # Topics waiting to be crawled
    # Results storage: topic ID ==> Topic object
    self.topicDict = dict()
    # Next comment page to process: topic ID ==> 1, 2, 3...
    self.nextPage = dict()
    # Topic ids whose crawl has finished
    self.finished = set()
    self.isCrawling = False
    # Maximum number of comments to crawl per topic
    # self.MAX_COMMETS_NUM = 5000
    self.MAX_COMMETS_NUM = float('inf')
    # Number of comments per page
    self.COMMENTS_PER_PAGE = 100
class TaskManager:
    log = logging.getLogger('TaskManager')

    def __init__(self,
                 tpSize=config.getfloat('irs', 'thread_pool_size'),
                 waitTimeout=3,
                 maxTasks=config.getfloat('irs', 'max_tasks')):
        self.storage_repository = config.get('irs', 'repository')
        self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
        self._tasks = {}
        self._unqueuedTasks = []

    def queue(self, task):
        return self._queueTask(task, task.commit)

    def queueRecovery(self, task):
        return self._queueTask(task, task.recover)

    def _queueTask(self, task, method):
        try:
            self.log.debug("queueing task: %s", task.id)
            self._tasks[task.id] = task
            if not self.tp.queueTask(task.id, method):
                self.log.error("unable to queue task: %s", task.dumpTask())
                del self._tasks[task.id]
                raise se.AddTaskError()
            self.log.debug("task queued: %s", task.id)
        except Exception, ex:
            self.log.error("Could not queue task, encountered: %s", str(ex))
            raise
        return task.id
def __init__(self, args=Strategy()):
    self.url = args.url
    self.max_depth = args.max_depth      # Maximum page depth
    self.max_count = args.max_count      # Maximum number of pages to crawl
    self.concurrency = args.concurrency  # Number of threads
    self.timeout = args.timeout          # Timeout
    self.cookies = args.cookies          # Cookies
    self.ssl_verify = args.ssl_verify    # SSL verification
    self.same_host = args.same_host      # Only crawl links on the same host
    self.same_domain = args.same_domain  # Only crawl links in the same domain
    self.currentDepth = 1                # Initial crawl depth, starting from 1
    self.keyword = args.keyword          # Keyword, decoded with the console's default encoding
    self.threadPool = ThreadPool(args.concurrency)  # Thread pool with the given number of threads
    self.visitedHrefs = set()            # Links already visited
    self.unvisitedHrefs = deque()        # Links waiting to be visited
    self.unvisitedHrefs.append(args.url) # Seed with the first link to visit
    self.isCrawling = False              # Flag marking whether the crawler is running
    self.file = BASEDIR + '/cache/crawler/' + genFilename(self.url) + '.txt'
    print self.file
    print 'args.url=\t', args.url
    # FIXME: the following line is problematic
    self.database = Database(args.dbFile)  # Database
    self.lock = Lock()
def __init__(self, url, depth, threadNum, dbfile, key):
    # Queue of urls to fetch
    self.urlQueue = Queue()
    # Queue of fetched html
    self.htmlQueue = Queue()
    # Urls already visited
    self.readUrls = []
    # Links not yet visited
    self.links = []
    # Number of threads
    self.threadNum = threadNum
    # Database file name
    self.dbfile = dbfile
    # Create the storage database object
    self.dataBase = SaveDataBase(self.dbfile)
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(self.threadNum)
    # Initialize the url queue
    self.urlQueue.put(url)
    # Keyword, decoded with the console's default encoding
    self.key = key.decode(getdefaultlocale()[1])
    # Crawl depth
    self.depth = depth
    # Current crawl depth
    self.currentDepth = 1
    # Current program state
    self.state = False
def __init__(self, args, queue):
    threading.Thread.__init__(self)
    # Maximum page depth
    self.depth = args['depth']
    # Initial crawl depth, starting from 1
    self.currentDepth = 1
    # Keyword, decoded with the console's default encoding
    self.keyword = args['keyword'].decode(getdefaultlocale()[1])
    # Database
    self.database = Database(db="bt_tornado")
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(args['threadNum'])
    # Links already visited
    self.visitedHrefs = set()
    # Links waiting to be visited
    self.unvisitedHrefs = deque()
    # Seed the queue with the start URLs
    for url in args['url']:
        self.unvisitedHrefs.append(url)
    # Flag marking whether the crawler is running
    self.isCrawling = False
    # Allow or deny crawl url
    self.entryFilter = args['entryFilter']
    # Allow to output back url
    self.yieldFilter = args['yieldFilter']
    # self.callbackFilter = args['callbackFilter']
    # self.db = args['db']
    self.collection = args['collection']
    # Communication queue
    self.queue = queue
def __init__(self, args):
    # Maximum crawl depth
    self.max_deepth = args['deepth']
    # Current depth
    self.current_deepth = 1
    # Thread management
    self.threadPool = ThreadPool(args['threads'])
    # Database file to use
    self.dbfile = args['dbfile']
    # Keyword
    self.keyword = args['keyword']
    # Whether to self-test
    self.testself = args['testself']
    # Links to visit at the current level; a set for deduplication
    self.unvisitedUrl = set()
    self.unvisitedUrl.add(args['url'])
    # Links already visited
    self.visitedUrl = set()
    self.q = Queue()
    # HTTP headers
    self.header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36'
    }
    # Connect to the database
    self.connDB()
    self.isRunning = True
def __init__(self,
             tpSize=config.getint('irs', 'thread_pool_size'),
             waitTimeout=3,
             maxTasks=config.getint('irs', 'max_tasks')):
    self.storage_repository = config.get('irs', 'repository')
    self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
    self._tasks = {}
    self._unqueuedTasks = []
def __init__(self, group_id, topic_id_list, thread_num, base_path,
             topic_info_path, comment_info_path):
    """
    `group_id`          the current group id
    `topic_id_list`     list of topic ids to crawl
    `thread_num`        number of threads to start
    `topic_info_path`   file for storing topic info
    `comment_info_path` file for storing comment info
    """
    # Thread pool with the given number of threads
    self.thread_pool = ThreadPool(thread_num)
    # Each topic is saved to its own file, so saving can run concurrently
    self.save_thread = ThreadPool(10)
    self.topic_info_path = topic_info_path
    self.comment_info_path = comment_info_path
    self.base_path = base_path
    # Pages already visited: Group id ==> True or False
    self.visited_href = set()
    # Topic ids that failed to be crawled
    self.failed = set()
    # Extract topic comments for each group in turn
    self.group_id = group_id
    self.topic_id_list = topic_id_list  # Topics waiting to be crawled
    # Results storage: topic ID ==> Topic object
    self.topic_dict = dict()
    # Next comment page to process: topic ID ==> 1, 2, 3...
    self.next_page = dict()
    # Topic ids whose crawl has finished
    self.finished = set()
    self.is_crawling = False
    # Maximum number of comments to crawl per topic
    # self.MAX_COMMETS_NUM = 5000
    self.MAX_COMMETS_NUM = float('inf')
    # Number of comments per page
    self.COMMENTS_PER_PAGE = 100
def __init__(self, args):
    # Maximum page depth
    self.depth = args.depth
    # Current crawl depth, starting from 1
    self.currentDepth = 1
    # Database
    self.database = Database(args.dbFile)
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(args.threadNum)
    # Links already visited
    self.visitedHrefs = set()
    # Pages waiting to be visited
    self.unvisitedHrefs = deque()
    # The first page to visit
    self.url = args.url
    self.unvisitedHrefs.append(args.url)
    # Flag marking whether the crawler is running
    self.isCrawling = False
def testThreadPool(self):
    allTheThreads = []
    with ThreadPool(10) as tp:
        for i in range(200):
            w = MockWorker(None, None, None, None, f"Thread {i}")
            allTheThreads.append(w)
            tp.addWorker(w)
    for thread in allTheThreads:
        self.assertFalse(thread.is_alive())
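# The test above relies on a ThreadPool that supports the context-manager
# protocol plus an addWorker() method, with __exit__ blocking until every
# worker has finished. The real implementation is not shown here; this is
# a minimal sketch of such a pool under those assumptions:
import threading

class ThreadPool:
    def __init__(self, max_workers):
        self._slots = threading.Semaphore(max_workers)  # bounds concurrency
        self._workers = []

    def addWorker(self, worker):
        self._slots.acquire()              # wait for a free slot
        self._workers.append(worker)
        run = worker.run
        def _run():
            try:
                run()
            finally:
                self._slots.release()      # free the slot when done
        worker.run = _run                  # wrap before the thread starts
        worker.start()

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        for w in self._workers:
            w.join()                       # __exit__ waits for all workers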
def __init__(self, url, threadnum, limit):
    # self.database = Database('pichref.sql')
    self.file = PicFile('imgfile', 'a')
    self.threadPool = ThreadPool(threadnum)
    self.unaccesshref = deque()    # Double-ended queue of unvisited links
    self.accessedhref = set()      # Set of links already visited
    self.unaccesshref.append(url)  # Seed with the initial link
    self.limit = limit
    self.picUrlCount = 1
def __init__(self, threadnum, pathname, limit):
    '''`limit` caps the number of pictures; `pathname` is the save directory'''
    super(Crawler, self).__init__()
    self.threadPool = ThreadPool(threadnum)
    self.file = PicFile('imgfile', 'r')
    self.urlqueue = deque()
    self.count = 1
    self._makePath(pathname)
    self.savaPath = os.getcwd() + '/' + pathname
    self._getUrl(limit)
def __init__(
    self,
    tpSize=config.getfloat("irs", "thread_pool_size"),
    waitTimeout=3,
    maxTasks=config.getfloat("irs", "max_tasks"),
):
    self.storage_repository = config.get("irs", "repository")
    self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
    self._tasks = {}
    self._unqueuedTasks = []
def __init__(self, args):
    # Maximum page depth
    self.depth = args.depth
    # Initial crawl depth, starting from 1
    self.currentDepth = 1
    # Keyword, decoded with the console's default encoding
    self.keyword = args.keyword.decode(getdefaultlocale()[1])
    # Database
    self.database = Database()
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(args.threadNum)
    # Links already visited
    self.visitedHrefs = set()
    # Links waiting to be visited
    self.unvisitedHrefs = deque()
    # Add the first link to visit
    self.unvisitedHrefs.append(args.url)
    # Flag marking whether the crawler is running
    self.isCrawling = False
class FileTransferService(TCPService):
    def __init__(self, ip, port, directory):
        super(FileTransferService, self).__init__(name='FileTransfer',
                                                  ip=ip, port=port)
        self.directory = directory
        self.threadpool = ThreadPool(2)

    def start_service(self):
        self.threadpool.add_task(self.start_server, ('', 0))
        self.threadpool.wait_completion()

    def process_server_response(self, message, address):
        # The requested file path arrives in the message payload.
        needed_file = open(self.directory + '/' + message)

    def get_server_port(self):
        return self.server_socket.getsockname()[1]

    def get_file_from_node(self, node_ip, node_port, file_path):
        pass
def __init__(self, group_id, thread_num, group_info_path, topic_list_path,
             max_topics_num=1000):
    """
    `group_id`        the group id to crawl
    `thread_num`      number of crawler threads
    `group_info_path` file path for storing the group's own info
    `topic_list_path` file path for saving the complete topic id list
    """
    # Thread pool with the given number of threads
    self.thread_pool = ThreadPool(thread_num)
    # Thread for saving topics
    self.save_thread = ThreadPool(1)
    # Thread for writing the database
    # self.DBThread = ThreadPool(1)
    # Group-related information
    self.group_info_path = group_info_path
    self.topic_list_path = topic_list_path
    # Pages already visited: Group id ==> True or False
    self.visited_href = set()
    # Group discussion pages waiting to be visited
    self.unvisited_href = deque()
    # Links that failed to load
    self.failed_href = set()
    self.lock = Lock()  # Thread lock
    self.group_id = group_id
    self.group_info = None  # models.Group
    # Crawling ends in one of two cases: 1) the number of topics crawled
    # has reached the maximum; 2) all topics have been crawled
    # Only topic ids are saved
    self.topic_list = list()
    self.is_crawling = False
    # self.database = Database("DoubanGroup.db")
    # Maximum number of topics to crawl per group
    self.MAX_TOPICS_NUM = max_topics_num
def __init__(self, dbName, threadNum, logLevel, startUrls, depth, keyword,
             downloadMode):
    self.__threadNum = threadNum
    self.__startUrls = startUrls
    self.__depth = depth
    self.__keyword = keyword
    self.__downloadMode = downloadMode
    self.__dbName = dbName
    self.__logLevel = logLevel
    self.__exitEvent = threading.Event()
    # URL queue holding nodes waiting to be downloaded
    self.__urlQueue = Queue.Queue()
    # HTML queue holding downloaded nodes waiting to be parsed
    self.__htmlQueue = Queue.Queue()
    # Data queue holding parsed nodes that qualify for database storage
    self.__dataQueue = Queue.Queue()
    # Download queues allocated to the individual download modules
    self.__downloadQueueList = []
    # Create the thread pool
    self.__threadPool = ThreadPool(threadNum + 2)
    self.__downloadingFlag = 0
def saveProxies():
    threadPool = ThreadPool(30)
    threadPool.startThreads()
    proxyFileOK = open('proxyOK.txt', 'a')
    proxyFileFail = open('proxyFail.txt', 'a')
    for proxy in proxiex:
        threadPool.putTask(checkProxy, proxy)
    while threadPool.getTaskLeft():
        flag, proxy = threadPool.getTaskResult()
        print flag, proxy
        if flag == 'ok':
            proxyFileOK.write(proxy)
            proxyFileOK.write('\n')
        else:
            proxyFileFail.write(proxy)
            proxyFileFail.write('\n')
    threadPool.stopThreads()
    proxyFileOK.close()
    proxyFileFail.close()
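# checkProxy() and the `proxiex` list are defined elsewhere. A
# hypothetical Python 2 sketch that matches the (flag, proxy) result
# shape consumed above; the test URL and timeout are illustrative:
import urllib2

def checkProxy(proxy):
    # Route a request through the candidate proxy and report the outcome.
    opener = urllib2.build_opener(
        urllib2.ProxyHandler({'http': 'http://' + proxy}))
    try:
        opener.open('http://www.baidu.com', timeout=5).read()
        return 'ok', proxy
    except Exception:
        return 'fail', proxy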
def __init__(self, section_id, post_id_list, crawler_thread_num,
             save_thread_num, post_base_path):
    """
    `section_id`     the Tianya board name
    `post_id_list`   list of post ids to crawl
    `thread_num`     number of threads to start
    `post_base_path` base directory for the crawl results; each post gets
                     its own file, named after the post's ID
    """
    # Thread pool for fetching pages, with the given number of threads
    self.thread_pool = ThreadPool(crawler_thread_num)
    # Each topic is saved to its own file, so saving can run concurrently
    self.save_thread = ThreadPool(save_thread_num)
    # Base path for storing the crawl results
    self.base_path = post_base_path
    # Pages already visited: Group id ==> True or False
    self.visited_href = set()
    self.visited_post = set()  # Ids of pages already queued for visiting
    self.finished = set()      # Topic ids whose crawl has finished
    # Topic ids that failed to be crawled
    self.failed = set()
    # Extract topic comments for each board in turn
    self.section_id = section_id
    self.post_id_list = post_id_list  # Posts waiting to be crawled
    # Used to feed post ids into the task list step by step
    self.current_post_id_list = list(post_id_list)
    # Results storage: topic ID ==> Topic object
    self.post_dict = dict()
    # Next comment page to process: topic ID ==> 1, 2, 3...
    self.next_page = dict()
    self.is_crawling = False
    # Maximum number of comments to crawl per topic
    # self.MAX_COMMETS_NUM = 1000
    self.MAX_COMMETS_NUM = float('inf')
def main():
    node_list = ["0.0.0.0", "localhost"]
    main_pool = ThreadPool(3)
    discovery_service = DiscoveryService(ip="127.0.0.1",
                                         port=3000,
                                         initial_nodes=node_list,
                                         period=5)
    file_service = FileService(ip="127.0.0.1",
                               port=3001,
                               node_list=node_list,
                               directory='files/',
                               timeout=5)
    main_pool.add_task(discovery_service.start_service)
    main_pool.add_task(file_service.start_service)
    main_pool.wait_completion()
class Crawler(object):
    def __init__(self, threadnum, pathname, limit):
        '''`limit` caps the number of pictures; `pathname` is the save directory'''
        super(Crawler, self).__init__()
        self.threadPool = ThreadPool(threadnum)
        self.file = PicFile('imgfile', 'r')
        self.urlqueue = deque()
        self.count = 1
        self._makePath(pathname)
        self.savaPath = os.getcwd() + '/' + pathname
        self._getUrl(limit)

    def _makePath(self, pathname):
        '''Create the target directory under the current directory'''
        if not os.path.isdir(os.getcwd() + '/' + pathname):
            os.mkdir(os.getcwd() + '/' + pathname)

    def _getUrl(self, num):
        '''Load URLs from the file into the deque'''
        while len(self.urlqueue) < num:
            self.urlqueue.append(self.file.getData().rstrip('\n'))
        self.file.close()

    def start(self):
        print '---start downloading picture---'
        self.threadPool.startThreads()
        while self.urlqueue != deque([]):
            self.threadPool.putTask(self._handleTask, self.urlqueue.popleft())
        self.stop()

    def stop(self):
        self.threadPool.stopThreads()
        print '---end downloading picture---'

    def _handleTask(self, url):
        '''Task handler'''
        self._download(url)

    def _download(self, url, retry=2):
        '''Download a picture, naming files in ascending numeric order'''
        try:
            r = requests.get(url)
            with open(self.savaPath + '/' + str(self.count) + '.jpg', 'wb') as jpg:
                jpg.write(r.content)
            self.count += 1
            print url
        except Exception, e:
            # Retry a failed download, decrementing the budget each time
            if retry > 0:
                self._download(url, retry - 1)
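# A minimal usage sketch for the downloader above (thread count, target
# directory and image limit are illustrative values):
if __name__ == '__main__':
    crawler = Crawler(10, 'pics', 100)
    crawler.start()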
def start(self):
    with ThreadPool(self.max_jobs) as tp:
        for url_to_visit in self.urls_provider:
            if not self.exclusions.isExcluded(url_to_visit):
                logging.info(f"visiting url {url_to_visit.value}...")
                try:
                    self._waitUntilWorkingHour()
                    w = Worker(self.user_agent,
                               self.sentenceProcessor,
                               self.urlProcessor,
                               self.webSiteInfoProvider,
                               self.MINIMUM_WORDS_PER_SENTENCE,
                               url_to_visit.value)
                    tp.addWorker(w)
                except Exception as ex:
                    logging.error(f"Error fetching url {url_to_visit.value}")
                    logging.error(ex)
def __init__(self, args):
    # Maximum page depth
    self.depth = args.depth
    # Initial crawl depth, starting from 1
    self.currentDepth = 1
    # Keyword, decoded with the console's default encoding
    self.keyword = args.keyword.decode(getdefaultlocale()[1])
    # Database
    self.database = Database(args.dbFile)
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(args.threadNum)
    # Links already visited
    self.visitedHrefs = set()
    # Links waiting to be visited
    self.unvisitedHrefs = deque()
    # Add the first link to visit
    self.unvisitedHrefs.append(args.url)
    # Flag marking whether the crawler is running
    self.isCrawling = False
def main():
    threadPool = ThreadPool(5)
    threadPool.startThreads()
    # Read in unicode characters
    f = codecs.open('tables/TopicInfo-all.txt', 'r', 'utf-8')
    count = 0
    for line in f:
        line = line.strip()
        seg_list = line.split('[=]')
        if seg_list[1] == 'ustv':
            threadPool.putTask(task_handler, seg_list[0], seg_list)
            count += 1
    f.close()
    while threadPool.getTaskLeft() > 0:
        time.sleep(10)
        print 'Waiting to finish. Task left: %d' % threadPool.getTaskLeft()
    log.info('Number of topics in ustv: %d' % count)
def __init__(self, inbox, outbox, hostID, queue, monitorInterval):
    # Save arguments
    tpSize = config.getint('irs', 'thread_pool_size') / 2
    waitTimeout = 3
    maxTasks = config.getint('irs', 'max_tasks')
    self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
    self._stop = False
    self._flush = False
    self._queue = queue
    self._activeMessages = {}
    self._monitorInterval = monitorInterval
    self._hostID = int(hostID)
    self._used_slots_array = [0] * MESSAGES_PER_MAILBOX
    self._outgoingMail = EMPTYMAILBOX
    self._incomingMail = EMPTYMAILBOX
    # TODO: add support for multiple paths (multiple mailboxes)
    self._spmStorageDir = config.get('irs', 'repository')
    self._inCmd = [constants.EXT_DD,
                   'if=' + str(inbox),
                   'iflag=direct,fullblock',
                   'bs=' + str(BLOCK_SIZE),
                   'count=' + str(BLOCKS_PER_MAILBOX),
                   'skip=' + str(self._hostID * BLOCKS_PER_MAILBOX)
                   ]
    self._outCmd = [constants.EXT_DD,
                    'of=' + str(outbox),
                    'iflag=fullblock',
                    'oflag=direct',
                    'conv=notrunc',
                    'bs=' + str(BLOCK_SIZE),
                    'seek=' + str(self._hostID * BLOCKS_PER_MAILBOX)
                    ]
    self._init = False
    self._initMailbox()  # Read initial mailbox state
    self._msgCounter = 0
    self._sendMail()  # Clear outgoing mailbox
    threading.Thread.__init__(self)
    self.daemon = True
    self.name = "mailbox.HSMMonitor"
    self.start()
def __init__(self, args, startURLs):
    # Maximum page depth
    self.depth = args.depth
    # Initial crawl depth, starting from 1
    self.currentDepth = 1
    # Keyword, decoded with the console's default encoding
    # self.keyword = args.keyword.decode(getdefaultlocale()[1])
    # Database
    self.database = Database(args.dbFile)
    # Store group ids to file, using UTF-8
    self.groupfile = codecs.open("GroupID.txt", "w", "UTF-8")
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(args.threadNum)
    # Group ids already visited
    self.visitedGroups = set()
    # Group ids waiting to be visited
    self.unvisitedGroups = deque()
    # Information for all groups
    self.groupInfo = []
    self.lock = Lock()  # Thread lock
    # Flag marking whether the crawler is running
    self.isCrawling = False
    # Add the group home pages not yet visited
    for url in startURLs:
        match_obj = REGroup.match(url)
        print "Add start urls:", url
        assert(match_obj != None)
        self.unvisitedGroups.append(match_obj.group(1))
    # Maximum number of visits allowed per minute
    self.MAX_VISITS_PER_MINUTE = 10
    # Number of pages already visited in the current period
    self.currentPeriodVisits = 0
    # Treat one minute as one visit period; record the period's start,
    # initialized with the current time
    self.periodStart = time.time()
def __init__(self, args):
    # Maximum page depth
    self.depth = args.depth
    # Initial crawl depth, starting from 1
    self.currentDepth = 1
    # Keyword, decoded with the console's default encoding
    self.keyword = args.keyword.decode(getdefaultlocale()[1])
    # Database
    self.database = Database(args.dbFile)
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(args.threadNum)
    # Links already visited
    self.visitedHrefs = set()
    # Links waiting to be visited
    self.unvisitedHrefs = deque()
    # Seeding a single start link is disabled; domain seeds are used instead
    # self.unvisitedHrefs.append(args.url)
    # Flag marking whether the crawler is running
    self.isCrawling = False
    self.domainPattern = re.compile(
        r"^([0-9a-zA-Z][0-9a-zA-Z-]{0,62}\.)+([0-9a-zA-Z][0-9a-zA-Z-]{0,62})\.?$")
    self.maxDomainSeeds = args.maxDomainSeeds
    self._initDomainSeedsList(args.domainSeeds)
def subscriber_join_verify(self, num_subscribers=10, num_channels=1,
                           channel_start=0, cbs=None, port_list=[]):
    self.test_status = False
    self.num_subscribers = num_subscribers
    self.subscriber_load(create=True, num=num_subscribers,
                         num_channels=num_channels,
                         channel_start=channel_start, port_list=port_list)
    self.onos_aaa_load()
    self.thread_pool = ThreadPool(min(100, self.num_subscribers),
                                  queue_size=1, wait_timeout=1)
    chan_leave = False  # for single channel, multiple subscribers
    if cbs is None:
        cbs = (self.tls_verify, self.dhcp_verify, self.igmp_verify,
               self.traffic_verify)
        chan_leave = True
    for subscriber in self.subscriber_list:
        subscriber.start()
        pool_object = subscriber_pool(subscriber, cbs)
        self.thread_pool.addTask(pool_object.pool_cb)
    self.thread_pool.cleanUpThreads()
    for subscriber in self.subscriber_list:
        subscriber.stop()
        if chan_leave is True:
            subscriber.channel_leave(0)
    self.num_subscribers = 0
    return self.test_status
class DiscoveryService(UDPService):
    def __init__(self, ip, port, initial_nodes, period):
        super(DiscoveryService, self).__init__(name='Discovery', ip=ip,
                                               port=port)
        self.period = period
        self.nodes = initial_nodes
        self.threadpool = ThreadPool(2)

    def start_service(self):
        self.threadpool.add_task(self.discovery_job)
        self.threadpool.add_task(self.start_server)
        self.threadpool.wait_completion()

    def discovery_job(self):
        while True:
            self.send_nodes_to_others()
            sleep(self.period)

    def update_nodes(self, discovered_list):
        for discovered_node in discovered_list:
            # Compare by value; `is not` would compare object identity
            if (discovered_node not in self.nodes
                    and discovered_node != self.ip):
                self.nodes.append(discovered_node)

    def process_server_response(self, message, address):
        self.update_nodes(ast.literal_eval(message))
        print("Nodes list updated by Discovery service's server: " +
              str(self.nodes))

    def send_nodes_to_others(self):
        message = str(self.nodes)
        for node in self.nodes:
            self.send_message(str_message=message,
                              address=(node, self.port),
                              socket=self.client_socket)
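# A minimal sketch of running one discovery peer locally (the port,
# period and seed list are illustrative; the UDPService base class and
# its socket wiring are assumed to come from the surrounding project):
if __name__ == '__main__':
    service = DiscoveryService(ip='127.0.0.1', port=3000,
                               initial_nodes=['127.0.0.1'], period=5)
    service.start_service()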
class subscriber_exchange(unittest.TestCase):

    apps = ('org.opencord.aaa', 'org.onosproject.dhcp')
    olt_apps = ()  # 'org.opencord.cordmcast'
    table_app = 'org.ciena.cordigmp'
    dhcp_server_config = {
        "ip": "10.1.11.50",
        "mac": "ca:fe:ca:fe:ca:fe",
        "subnet": "255.255.252.0",
        "broadcast": "10.1.11.255",
        "router": "10.1.8.1",
        "domain": "8.8.8.8",
        "ttl": "63",
        "delay": "2",
        "startip": "10.1.11.51",
        "endip": "10.1.11.100",
    }
    aaa_loaded = False
    test_path = os.path.dirname(os.path.realpath(__file__))
    table_app_file = os.path.join(test_path, '..', 'apps/ciena-cordigmp-multitable-2.0-SNAPSHOT.oar')
    app_file = os.path.join(test_path, '..', 'apps/ciena-cordigmp-2.0-SNAPSHOT.oar')
    onos_config_path = os.path.join(test_path, '..', 'setup/onos-config')
    olt_conf_file = os.path.join(test_path, '..', 'setup/olt_config.json')
    cpqd_path = os.path.join(test_path, '..', 'setup')
    ovs_path = cpqd_path
    test_services = ('IGMP', 'TRAFFIC')
    num_joins = 0
    num_subscribers = 0
    num_channels = 0
    recv_timeout = False
    onos_restartable = not bool(int(os.getenv('ONOS_RESTART_DISABLED', 0)))

    @classmethod
    def load_device_id(cls):
        '''Configure the device id'''
        did = OnosCtrl.get_device_id()
        # Set the default config
        cls.device_id = did
        cls.device_dict = {
            "devices": {
                "{}".format(did): {
                    "basic": {
                        "driver": "pmc-olt"
                    }
                }
            },
        }
        return did

    @classmethod
    def setUpClass(cls):
        '''Load the OLT config and activate relevant apps'''
        did = cls.load_device_id()
        network_cfg = {
            "devices": {
                "{}".format(did): {
                    "basic": {
                        "driver": "pmc-olt"
                    }
                }
            },
        }
        # Restart ONOS with cpqd driver config for OVS
        cls.start_onos(network_cfg=network_cfg)
        cls.install_app_table()
        cls.olt = OltConfig(olt_conf_file=cls.olt_conf_file)
        OnosCtrl.cord_olt_config(cls.olt.olt_device_data())
        cls.port_map, cls.port_list = cls.olt.olt_port_map()
        cls.activate_apps(cls.apps + cls.olt_apps)

    @classmethod
    def tearDownClass(cls):
        '''Deactivate the olt apps and restart OVS back'''
        apps = cls.olt_apps + (cls.table_app,)
        for app in apps:
            onos_ctrl = OnosCtrl(app)
            onos_ctrl.deactivate()
        cls.uninstall_app_table()
        cls.start_onos(network_cfg={})

    @classmethod
    def activate_apps(cls, apps):
        for app in apps:
            onos_ctrl = OnosCtrl(app)
            status, _ = onos_ctrl.activate()
            assert_equal(status, True)
            time.sleep(2)

    @classmethod
    def install_app_table(cls):
        # Uninstall the existing app if any
        OnosCtrl.uninstall_app(cls.table_app)
        time.sleep(2)
        log.info('Installing the multi table app %s for subscriber test' % (cls.table_app_file))
        OnosCtrl.install_app(cls.table_app_file)
        time.sleep(3)

    @classmethod
    def uninstall_app_table(cls):
        # Uninstall the table app on class exit
        OnosCtrl.uninstall_app(cls.table_app)
        time.sleep(2)
        log.info('Installing back the cord igmp app %s for subscriber test on exit' % (cls.app_file))
        OnosCtrl.install_app(cls.app_file)

    @classmethod
    def start_onos(cls, network_cfg=None):
        if cls.onos_restartable is False:
            log.info('ONOS restart is disabled. Skipping ONOS restart')
            return
        if network_cfg is None:
            network_cfg = cls.device_dict
        if type(network_cfg) is tuple:
            res = []
            for v in network_cfg:
                res += v.items()
            config = dict(res)
        else:
            config = network_cfg
        log.info('Restarting ONOS with new network configuration')
        return cord_test_onos_restart(config=config)

    @classmethod
    def remove_onos_config(cls):
        try:
            os.unlink('{}/network-cfg.json'.format(cls.onos_config_path))
        except:
            pass

    @classmethod
    def start_cpqd(cls, mac='00:11:22:33:44:55'):
        dpid = mac.replace(':', '')
        cpqd_file = os.sep.join((cls.cpqd_path, 'cpqd.sh'))
        cpqd_cmd = '{} {}'.format(cpqd_file, dpid)
        ret = os.system(cpqd_cmd)
        assert_equal(ret, 0)
        time.sleep(10)
        device_id = 'of:{}{}'.format('0' * 4, dpid)
        return device_id

    @classmethod
    def start_ovs(cls):
        ovs_file = os.sep.join((cls.ovs_path, 'of-bridge.sh'))
        ret = os.system(ovs_file)
        assert_equal(ret, 0)
        time.sleep(30)

    def onos_aaa_load(self):
        if self.aaa_loaded:
            return
        aaa_dict = {'apps': {'org.onosproject.aaa':
                             {'AAA': {'radiusSecret': 'radius_password',
                                      'radiusIp': '172.17.0.2'}}}}
        radius_ip = os.getenv('ONOS_AAA_IP') or '172.17.0.2'
        aaa_dict['apps']['org.onosproject.aaa']['AAA']['radiusIp'] = radius_ip
        self.onos_load_config('org.onosproject.aaa', aaa_dict)
        self.aaa_loaded = True

    def onos_dhcp_table_load(self, config=None):
        dhcp_dict = {'apps': {'org.onosproject.dhcp':
                              {'dhcp': copy.copy(self.dhcp_server_config)}}}
        dhcp_config = dhcp_dict['apps']['org.onosproject.dhcp']['dhcp']
        if config:
            for k in config.keys():
                if dhcp_config.has_key(k):
                    dhcp_config[k] = config[k]
        self.onos_load_config('org.onosproject.dhcp', dhcp_dict)

    def onos_load_config(self, app, config):
        status, code = OnosCtrl.config(config)
        if status is False:
            log.info('JSON config request for app %s returned status %d' % (app, code))
            assert_equal(status, True)
        time.sleep(2)

    def dhcp_sndrcv(self, dhcp, update_seed=False):
        cip, sip = dhcp.discover(update_seed=update_seed)
        assert_not_equal(cip, None)
        assert_not_equal(sip, None)
        log.info('Got dhcp client IP %s from server %s for mac %s' %
                 (cip, sip, dhcp.get_mac(cip)[0]))
        return cip, sip

    def dhcp_request(self, subscriber, seed_ip='10.10.10.1', update_seed=False):
        config = {'startip': '10.10.10.20', 'endip': '10.10.10.200',
                  'ip': '10.10.10.2', 'mac': "ca:fe:ca:fe:ca:fe",
                  'subnet': '255.255.255.0', 'broadcast': '10.10.10.255',
                  'router': '10.10.10.1'}
        self.onos_dhcp_table_load(config)
        dhcp = DHCPTest(seed_ip=seed_ip, iface=subscriber.iface)
        cip, sip = self.dhcp_sndrcv(dhcp, update_seed=update_seed)
        return cip, sip

    def recv_channel_cb(self, pkt):
        # First verify that we have received the packet for the joined instance
        chan = self.subscriber.caddr(pkt[IP].dst)
        assert_equal(chan in self.subscriber.join_map.keys(), True)
        recv_time = monotonic.monotonic() * 1000000
        join_time = self.subscriber.join_map[chan][self.subscriber.STATS_JOIN].start
        delta = recv_time - join_time
        self.subscriber.join_rx_stats.update(packets=1, t=delta, usecs=True)
        self.subscriber.channel_update(chan, self.subscriber.STATS_RX, 1, t=delta)
        log.debug('Packet received in %.3f usecs for group %s after join' %
                  (delta, pkt[IP].dst))
        self.test_status = True

    def traffic_verify(self, subscriber):
        if subscriber.has_service('TRAFFIC'):
            url = 'http://www.google.com'
            resp = requests.get(url)
            self.test_status = resp.ok
            if resp.ok == False:
                log.info('Subscriber %s failed get from url %s with status code %d' %
                         (subscriber.name, url, resp.status_code))
            else:
                log.info('GET request from %s succeeded for subscriber %s' %
                         (url, subscriber.name))

    def tls_verify(self, subscriber):
        if subscriber.has_service('TLS'):
            time.sleep(2)
            tls = TLSAuthTest(intf=subscriber.rx_intf)
            log.info('Running subscriber %s tls auth test' % subscriber.name)
            tls.runTest()
            self.test_status = True

    def dhcp_verify(self, subscriber):
        if subscriber.has_service('DHCP'):
            cip, sip = self.dhcp_request(subscriber, update_seed=True)
            log.info('Subscriber %s got client ip %s from server %s' %
                     (subscriber.name, cip, sip))
            subscriber.src_list = [cip]
            self.test_status = True
        else:
            subscriber.src_list = ['10.10.10.{}'.format(subscriber.rx_port)]
            self.test_status = True

    def dhcp_jump_verify(self, subscriber):
        if subscriber.has_service('DHCP'):
            cip, sip = self.dhcp_request(subscriber, seed_ip='10.10.200.1')
            log.info('Subscriber %s got client ip %s from server %s' %
                     (subscriber.name, cip, sip))
            subscriber.src_list = [cip]
            self.test_status = True
        else:
            subscriber.src_list = ['10.10.10.{}'.format(subscriber.rx_port)]
            self.test_status = True

    def dhcp_next_verify(self, subscriber):
        if subscriber.has_service('DHCP'):
            cip, sip = self.dhcp_request(subscriber, seed_ip='10.10.150.1')
            log.info('Subscriber %s got client ip %s from server %s' %
                     (subscriber.name, cip, sip))
            subscriber.src_list = [cip]
            self.test_status = True
        else:
            subscriber.src_list = ['10.10.10.{}'.format(subscriber.rx_port)]
            self.test_status = True

    def igmp_verify(self, subscriber):
        chan = 0
        if subscriber.has_service('IGMP'):
            # We wait for all the subscribers to join before triggering leaves
            if subscriber.rx_port > 1:
                time.sleep(5)
            subscriber.channel_join(chan, delay=0)
            self.num_joins += 1
            while self.num_joins < self.num_subscribers:
                time.sleep(5)
            log.info('All subscribers have joined the channel')
            for i in range(10):
                subscriber.channel_receive(chan, cb=subscriber.recv_channel_cb, count=10)
                log.info('Leaving channel %d for subscriber %s' % (chan, subscriber.name))
                subscriber.channel_leave(chan)
                time.sleep(5)
                log.info('Interface %s Join RX stats for subscriber %s, %s' %
                         (subscriber.iface, subscriber.name, subscriber.join_rx_stats))
                # Should not receive packets for this subscriber
                self.recv_timeout = True
                subscriber.recv_timeout = True
                subscriber.channel_receive(chan, cb=subscriber.recv_channel_cb, count=10)
                subscriber.recv_timeout = False
                self.recv_timeout = False
                log.info('Joining channel %d for subscriber %s' % (chan, subscriber.name))
                subscriber.channel_join(chan, delay=0)
            self.test_status = True

    def igmp_jump_verify(self, subscriber):
        if subscriber.has_service('IGMP'):
            for i in xrange(subscriber.num):
                log.info('Subscriber %s jumping channel' % subscriber.name)
                chan = subscriber.channel_jump(delay=0)
                subscriber.channel_receive(chan, cb=subscriber.recv_channel_cb, count=1)
                log.info('Verified receive for channel %d, subscriber %s' % (chan, subscriber.name))
                time.sleep(3)
            log.info('Interface %s Jump RX stats for subscriber %s, %s' %
                     (subscriber.iface, subscriber.name, subscriber.join_rx_stats))
            self.test_status = True

    def igmp_next_verify(self, subscriber):
        if subscriber.has_service('IGMP'):
            for i in xrange(subscriber.num):
                if i:
                    chan = subscriber.channel_join_next(delay=0)
                else:
                    chan = subscriber.channel_join(i, delay=0)
                log.info('Joined next channel %d for subscriber %s' % (chan, subscriber.name))
                subscriber.channel_receive(chan, cb=subscriber.recv_channel_cb, count=1)
                log.info('Verified receive for channel %d, subscriber %s' % (chan, subscriber.name))
                time.sleep(3)
            log.info('Interface %s Join Next RX stats for subscriber %s, %s' %
                     (subscriber.iface, subscriber.name, subscriber.join_rx_stats))
            self.test_status = True

    def generate_port_list(self, subscribers, channels):
        return self.port_list[:subscribers]

    def subscriber_load(self, create=True, num=10, num_channels=1,
                        channel_start=0, port_list=[]):
        '''Load the subscriber from the database'''
        self.subscriber_db = SubscriberDB(create=create, services=self.test_services)
        if create is True:
            self.subscriber_db.generate(num)
        self.subscriber_info = self.subscriber_db.read(num)
        self.subscriber_list = []
        if not port_list:
            port_list = self.generate_port_list(num, num_channels)
        index = 0
        for info in self.subscriber_info:
            self.subscriber_list.append(
                Subscriber(name=info['Name'],
                           service=info['Service'],
                           port_map=self.port_map,
                           num=num_channels,
                           channel_start=channel_start,
                           tx_port=port_list[index][0],
                           rx_port=port_list[index][1]))
            if num_channels > 1:
                channel_start += num_channels
            index += 1
        # Load the ssm list for all subscriber channels
        igmpChannel = IgmpChannel()
        ssm_groups = map(lambda sub: sub.channels, self.subscriber_list)
        ssm_list = reduce(lambda ssm1, ssm2: ssm1 + ssm2, ssm_groups)
        igmpChannel.igmp_load_ssm_config(ssm_list)

    def subscriber_join_verify(self, num_subscribers=10, num_channels=1,
                               channel_start=0, cbs=None, port_list=[]):
        self.test_status = False
        self.num_subscribers = num_subscribers
        self.subscriber_load(create=True, num=num_subscribers,
                             num_channels=num_channels,
                             channel_start=channel_start, port_list=port_list)
        self.onos_aaa_load()
        self.thread_pool = ThreadPool(min(100, self.num_subscribers),
                                      queue_size=1, wait_timeout=1)
        chan_leave = False  # for single channel, multiple subscribers
        if cbs is None:
            cbs = (self.tls_verify, self.dhcp_verify, self.igmp_verify,
                   self.traffic_verify)
            chan_leave = True
        for subscriber in self.subscriber_list:
            subscriber.start()
            pool_object = subscriber_pool(subscriber, cbs)
            self.thread_pool.addTask(pool_object.pool_cb)
        self.thread_pool.cleanUpThreads()
        for subscriber in self.subscriber_list:
            subscriber.stop()
            if chan_leave is True:
                subscriber.channel_leave(0)
        self.num_subscribers = 0
        return self.test_status

    def test_subscriber_join_recv(self):
        """Test subscriber join and receive for channel surfing"""
        self.num_subscribers = 5
        self.num_channels = 1
        test_status = True
        # Run this test only if ONOS can be restarted, as it incurs a
        # network-cfg change
        if self.onos_restartable is True:
            test_status = self.subscriber_join_verify(
                num_subscribers=self.num_subscribers,
                num_channels=self.num_channels,
                port_list=self.generate_port_list(self.num_subscribers,
                                                  self.num_channels))
        assert_equal(test_status, True)

    def test_subscriber_join_jump(self):
        """Test subscriber join jump for channel surfing"""
        self.num_subscribers = 5
        self.num_channels = 10
        test_status = self.subscriber_join_verify(
            num_subscribers=self.num_subscribers,
            num_channels=self.num_channels,
            cbs=(self.tls_verify, self.dhcp_jump_verify,
                 self.igmp_jump_verify, self.traffic_verify),
            port_list=self.generate_port_list(self.num_subscribers,
                                              self.num_channels))
        assert_equal(test_status, True)

    def test_subscriber_join_next(self):
        """Test subscriber join next for channel surfing"""
        self.num_subscribers = 5
        self.num_channels = 10
        test_status = self.subscriber_join_verify(
            num_subscribers=self.num_subscribers,
            num_channels=self.num_channels,
            cbs=(self.tls_verify, self.dhcp_next_verify,
                 self.igmp_next_verify, self.traffic_verify),
            port_list=self.generate_port_list(self.num_subscribers,
                                              self.num_channels))
        assert_equal(test_status, True)
class SPM_MailMonitor:
    log = logging.getLogger('Storage.MailBox.SpmMailMonitor')

    def registerMessageType(self, messageType, callback):
        self._messageTypes[messageType] = callback

    def unregisterMessageType(self, messageType):
        del self._messageTypes[messageType]

    def __init__(self, pool, maxHostID, monitorInterval=2):
        self._messageTypes = {}
        # Save arguments
        self._stop = False
        self._stopped = False
        self._poolID = str(pool.spUUID)
        self._spmStorageDir = pool.storage_repository
        tpSize = config.getfloat('irs', 'thread_pool_size') / 2
        waitTimeout = 3
        maxTasks = config.getfloat('irs', 'max_tasks')
        self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
        # *** IMPORTANT NOTE: The SPM's inbox is the HSMs' outbox and vice versa ***
        self._inbox = os.path.join(self._spmStorageDir, self._poolID,
                                   "mastersd", sd.DOMAIN_META_DATA, "inbox")
        if not os.path.exists(self._inbox):
            self.log.error("SPM_MailMonitor create failed - inbox %s does "
                           "not exist" % repr(self._inbox))
            raise RuntimeError("SPM_MailMonitor create failed - inbox %s "
                               "does not exist" % repr(self._inbox))
        self._outbox = os.path.join(self._spmStorageDir, self._poolID,
                                    "mastersd", sd.DOMAIN_META_DATA, "outbox")
        if not os.path.exists(self._outbox):
            self.log.error("SPM_MailMonitor create failed - outbox %s does "
                           "not exist" % repr(self._outbox))
            raise RuntimeError("SPM_MailMonitor create failed - outbox %s "
                               "does not exist" % repr(self._outbox))
        self._numHosts = int(maxHostID)
        self._outMailLen = MAILBOX_SIZE * self._numHosts
        self._monitorInterval = monitorInterval
        # TODO: add support for multiple paths (multiple mailboxes)
        self._outgoingMail = self._outMailLen * "\0"
        self._incomingMail = self._outgoingMail
        self._inCmd = ['dd',
                       'if=' + str(self._inbox),
                       'iflag=direct,fullblock',
                       'count=1'
                       ]
        self._outCmd = ['dd',
                        'of=' + str(self._outbox),
                        'oflag=direct',
                        'iflag=fullblock',
                        'conv=notrunc',
                        'count=1'
                        ]
        self._outLock = thread.allocate_lock()
        self._inLock = thread.allocate_lock()
        # Clear outgoing mail
        self.log.debug("SPM_MailMonitor - clearing outgoing mail, command "
                       "is: %s", self._outCmd)
        cmd = self._outCmd + ['bs=' + str(self._outMailLen)]
        (rc, out, err) = misc.execCmd(cmd, sudo=False, data=self._outgoingMail)
        if rc:
            self.log.warning("SPM_MailMonitor couldn't clear outgoing mail, "
                             "dd failed")
        thread.start_new_thread(self.run, (self, ))
        self.log.debug('SPM_MailMonitor created for pool %s' % self._poolID)

    def stop(self):
        self._stop = True

    def isStopped(self):
        return self._stopped

    def getMaxHostID(self):
        return self._numHosts

    def setMaxHostID(self, newMaxId):
        self._inLock.acquire()
        self._outLock.acquire()
        diff = newMaxId - self._numHosts
        if diff > 0:
            delta = MAILBOX_SIZE * diff * "\0"
            self._outgoingMail += delta
            self._incomingMail += delta
        elif diff < 0:
            delta = MAILBOX_SIZE * diff
            self._outgoingMail = self._outgoingMail[:-delta]
            self._incomingMail = self._incomingMail[:-delta]
        self._numHosts = newMaxId
        self._outMailLen = MAILBOX_SIZE * self._numHosts
        self._outLock.release()
        self._inLock.release()

    def _validateMailbox(self, mailbox, mailboxIndex):
        chkStart = MAILBOX_SIZE - CHECKSUM_BYTES
        chk = misc.checksum(mailbox[0:chkStart], CHECKSUM_BYTES)
        pChk = struct.pack('<l', chk)  # Assumes CHECKSUM_BYTES equals 4!!!
        if pChk != mailbox[chkStart:chkStart + CHECKSUM_BYTES]:
            self.log.error("SPM_MailMonitor: mailbox %s checksum failed, "
                           "not clearing mailbox, clearing newMail.",
                           str(mailboxIndex))
            return False
        elif pChk == pZeroChecksum:
            return False  # Ignore messages of empty mailbox
        return True

    def _handleRequests(self, newMail):
        send = False
        # Run through all messages and check if new messages have arrived
        # (since last read)
        for host in range(0, self._numHosts):
            # Check mailbox checksum
            mailboxStart = host * MAILBOX_SIZE
            isMailboxValidated = False
            for i in range(0, MESSAGES_PER_MAILBOX):
                msgId = host * SLOTS_PER_MAILBOX + i
                msgStart = msgId * MESSAGE_SIZE
                # First byte of message is message version. Check message
                # version, if 0 then message is empty and can be skipped
                if newMail[msgStart] in ['\0', '0']:
                    continue
                # Most mailboxes are probably empty so it costs less to check
                # that all messages start with 0 than to validate the
                # mailbox, therefore this is done after we find a non empty
                # message in the mailbox
                if not isMailboxValidated:
                    if not self._validateMailbox(
                            newMail[mailboxStart:mailboxStart + MAILBOX_SIZE],
                            host):
                        # Cleaning invalid mbx in newMail
                        newMail = newMail[:mailboxStart] + EMPTYMAILBOX + \
                            newMail[mailboxStart + MAILBOX_SIZE:]
                        break
                    self.log.debug("SPM_MailMonitor: Mailbox %s validated, "
                                   "checking mail", host)
                    isMailboxValidated = True
                newMsg = newMail[msgStart:msgStart + MESSAGE_SIZE]
                msgOffset = msgId * MESSAGE_SIZE
                if newMsg == CLEAN_MESSAGE:
                    # Should probably put a setter on outgoingMail which
                    # would take the lock
                    self._outLock.acquire()
                    try:
                        self._outgoingMail = \
                            self._outgoingMail[0:msgOffset] + CLEAN_MESSAGE + \
                            self._outgoingMail[msgOffset + MESSAGE_SIZE:
                                               self._outMailLen]
                    finally:
                        self._outLock.release()
                    send = True
                    continue
                # Message isn't empty, check if it's new
                isMessageNew = False
                for j in range(msgStart, msgStart + MESSAGE_SIZE):
                    if newMail[j] != self._incomingMail[j]:
                        isMessageNew = True
                        break
                # If search exhausted, i.e. message hasn't changed since
                # last read, it can be skipped
                if not isMessageNew:
                    continue
                # We only get here if there is a novel request
                try:
                    msgType = newMail[msgStart + 1:msgStart + 5]
                    if msgType in self._messageTypes:
                        # Use message class to process request according to
                        # message specific logic
                        id = str(uuid.uuid4())
                        self.log.debug("SPM_MailMonitor: processing request: "
                                       "%s" %
                                       repr(newMail[msgStart:
                                                    msgStart + MESSAGE_SIZE]))
                        res = self.tp.queueTask(
                            id, runTask,
                            (self._messageTypes[msgType], msgId,
                             newMail[msgStart:msgStart + MESSAGE_SIZE])
                        )
                        if not res:
                            raise Exception()
                    else:
                        self.log.error("SPM_MailMonitor: unknown message "
                                       "type encountered: %s", msgType)
                except RuntimeError, e:
                    self.log.error("SPM_MailMonitor: exception: %s caught "
                                   "while handling message: %s", str(e),
                                   newMail[msgStart:msgStart + MESSAGE_SIZE])
                except:
                    self.log.error("SPM_MailMonitor: exception caught while "
                                   "handling message: %s",
                                   newMail[msgStart:msgStart + MESSAGE_SIZE],
                                   exc_info=True)
class HSM_MailMonitor(threading.Thread):
    log = logging.getLogger('Storage.MailBox.HsmMailMonitor')

    def __init__(self, inbox, outbox, hostID, queue, monitorInterval):
        # Save arguments
        tpSize = config.getfloat('irs', 'thread_pool_size') / 2
        waitTimeout = 3
        maxTasks = config.getfloat('irs', 'max_tasks')
        self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
        self._stop = False
        self._flush = False
        self._queue = queue
        self._activeMessages = {}
        self._monitorInterval = monitorInterval
        self._hostID = int(hostID)
        self._used_slots_array = [0] * MESSAGES_PER_MAILBOX
        self._outgoingMail = EMPTYMAILBOX
        self._incomingMail = EMPTYMAILBOX
        # TODO: add support for multiple paths (multiple mailboxes)
        self._spmStorageDir = config.get('irs', 'repository')
        self._inCmd = [constants.EXT_DD,
                       'if=' + str(inbox),
                       'iflag=direct,fullblock',
                       'bs=' + str(BLOCK_SIZE),
                       'count=' + str(BLOCKS_PER_MAILBOX),
                       'skip=' + str(self._hostID * BLOCKS_PER_MAILBOX)
                       ]
        self._outCmd = [constants.EXT_DD,
                        'of=' + str(outbox),
                        'iflag=fullblock',
                        'oflag=direct',
                        'conv=notrunc',
                        'bs=' + str(BLOCK_SIZE),
                        'seek=' + str(self._hostID * BLOCKS_PER_MAILBOX)
                        ]
        self._init = False
        self._initMailbox()  # Read initial mailbox state
        self._msgCounter = 0
        self._sendMail()  # Clear outgoing mailbox
        threading.Thread.__init__(self)
        self.start()

    def _initMailbox(self):
        # Sync initial incoming mail state with storage view
        (rc, out, err) = misc.execCmd(self._inCmd, sudo=False, raw=True)
        if rc == 0:
            self._incomingMail = out
            self._init = True
        else:
            self.log.warning("HSM_MailboxMonitor - Could not initialize "
                             "mailbox, will not accept requests until init "
                             "succeeds")

    def immStop(self):
        self._stop = True

    def immFlush(self):
        self._flush = True

    def _handleResponses(self, newMsgs):
        rc = False
        for i in range(0, MESSAGES_PER_MAILBOX):
            # Skip checking non used slots
            if self._used_slots_array[i] == 0:
                continue
            # Skip empty return messages (messages with version 0)
            start = i * MESSAGE_SIZE
            # First byte of message is message version.
            # Check return message version, if 0 then message is empty
            if newMsgs[start] in ['\0', '0']:
                continue
            for j in range(start, start + MESSAGE_SIZE):
                if newMsgs[j] != self._incomingMail[j]:
                    break
            # If search exhausted then message hasn't changed since last
            # read and can be skipped
            if j == (start + MESSAGE_SIZE - 1):
                continue
            # We only get here if there is a novel reply, so we can remove
            # the message from the active list and the outgoing mail and
            # handle the reply
            rc = True
            newMsg = newMsgs[start:start + MESSAGE_SIZE]
            if newMsg == CLEAN_MESSAGE:
                del self._activeMessages[i]
                self._used_slots_array[i] = 0
                self._msgCounter -= 1
                self._outgoingMail = self._outgoingMail[0:start] + \
                    MESSAGE_SIZE * "\0" + \
                    self._outgoingMail[start + MESSAGE_SIZE:]
                continue
            msg = self._activeMessages[i]
            self._activeMessages[i] = CLEAN_MESSAGE
            self._outgoingMail = self._outgoingMail[0:start] + \
                CLEAN_MESSAGE + self._outgoingMail[start + MESSAGE_SIZE:]
            try:
                self.log.debug("HSM_MailboxMonitor(%s/%s) - Checking reply: "
                               "%s", self._msgCounter, MESSAGES_PER_MAILBOX,
                               repr(newMsg))
                msg.checkReply(newMsg)
                if msg.callback:
                    try:
                        id = str(uuid.uuid4())
                        if not self.tp.queueTask(id, runTask,
                                                 (msg.callback,
                                                  msg.volumeData)):
                            raise Exception()
                    except:
                        self.log.error("HSM_MailMonitor: exception caught "
                                       "while running msg callback, for "
                                       "message: %s, callback function: %s",
                                       repr(msg.payload), msg.callback,
                                       exc_info=True)
            except RuntimeError, e:
                self.log.error("HSM_MailMonitor: exception: %s caught while "
                               "checking reply for message: %s, reply: %s",
                               str(e), repr(msg.payload), repr(newMsg))
            except:
class Crawler(object):
    def __init__(self, args=Strategy()):
        self.url = args.url
        self.max_depth = args.max_depth      # Maximum page depth
        self.max_count = args.max_count      # Maximum number of pages to crawl
        self.concurrency = args.concurrency  # Number of threads
        self.timeout = args.timeout          # Timeout
        self.cookies = args.cookies          # Cookies
        self.ssl_verify = args.ssl_verify    # SSL verification
        self.same_host = args.same_host      # Only crawl links on the same host
        self.same_domain = args.same_domain  # Only crawl links in the same domain
        self.currentDepth = 1                # Initial crawl depth, starting from 1
        self.keyword = args.keyword          # Keyword, decoded with the console's default encoding
        self.threadPool = ThreadPool(args.concurrency)  # Thread pool with the given number of threads
        self.visitedHrefs = set()            # Links already visited
        self.unvisitedHrefs = deque()        # Links waiting to be visited
        self.unvisitedHrefs.append(args.url) # Seed with the first link to visit
        self.isCrawling = False              # Flag marking whether the crawler is running
        self.file = BASEDIR + '/cache/crawler/' + genFilename(self.url) + '.txt'
        # FIXME: the following line is problematic
        self.database = Database(args.dbFile)  # Database
        self.lock = Lock()

    def start(self):
        self.isCrawling = True
        self.threadPool.startThreads()
        while self.currentDepth <= self.max_depth and \
                len(self.visitedHrefs) <= self.max_count:
            # Dispatch tasks: the pool downloads all pages at the current
            # depth concurrently (this call does not block)
            self._assignCurrentDepthTasks()
            # Wait until the pool has finished all tasks; a drained pool
            # means one full depth level has been crawled.
            # self.threadPool.taskJoin() would do the same, but then
            # Ctrl-C could not interrupt the wait.
            counter = 0
            while self.threadPool.getTaskLeft() and counter < 600:
                time.sleep(1)
                counter += 1
            self.currentDepth += 1
        self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        # self.database.close()

    def saveAllHrefsToFile(self, nonehtml=True):
        try:
            cf = CrawlerFile(url=self.url)
            contentlist = []
            hrefs = [i for i in self.visitedHrefs] + \
                    [j for j in self.unvisitedHrefs]
            for href in hrefs:
                if href.endswith('.html') and nonehtml:
                    continue
                contentlist.append(href)
            cf.saveSection('Hrefs', contentlist, coverfile=True)
        except:
            pass

    def _getCrawlerPaths(self, url):
        '''Collect the path prefixes of every known href that shares the
        scheme and host of `url`.'''
        try:
            paths = []
            baseulp = urlparse(url)
            cf = CrawlerFile(url=url)
            urls = cf.getSection('Hrefs')
            for eachline in urls:
                eachline = eachline.replace('\r', '')
                eachline = eachline.replace('\n', '')
                eachulp = urlparse(eachline)
                if baseulp.scheme == eachulp.scheme and \
                        baseulp.netloc == eachulp.netloc:
                    fullpath = eachulp.path
                    if fullpath.find('.') == -1 and \
                            not fullpath.endswith('/'):
                        fullpath += '/'
                    pos = 0
                    while True:
                        pos = fullpath.find('/', pos)
                        if pos == -1:
                            break
                        tmppth = eachulp.scheme + '://' + eachulp.netloc + \
                            eachulp.path[:pos]
                        if not tmppth.endswith('/') and tmppth not in paths:
                            paths.append(tmppth)
                        pos += 1
            return paths
        except Exception, e:
            print 'Exception:\t', e
            return [url]
# Stdlib dependencies; UDPService and ThreadPool come from the surrounding
# project.
from os import path
from socket import timeout
from time import time


class CheckFileService(UDPService):

    def __init__(self, ip, port, nodes, timeout, directory,
                 file_transfer_service):
        super(CheckFileService, self).__init__(name='CheckFile', ip=ip,
                                               port=port)
        self.nodes = nodes
        self.directory = directory
        self.client_socket.settimeout(timeout)
        self.threadpool = ThreadPool(2)
        self.file_transfer_service = file_transfer_service

    def start_service(self):
        self.threadpool.add_task(self.start_server)
        self.threadpool.wait_completion()

    def process_server_response(self, message, address):
        # Reply with "<file exists>,<TCP port for transfers>".
        file_existance = str(self.check_file_existance(message))
        tcp_server_port = str(self.file_transfer_service.get_server_port())
        self.send_message(str_message=file_existance + ',' + tcp_server_port,
                          address=address,
                          socket=self.server_socket)

    def check_file_existance(self, file_path):
        return path.exists(self.directory + '/' + file_path)

    def get_nodes_response_times(self, file_request_message):
        """Ask every other node about a file and time each response."""
        nodes_responses = []
        for node in self.nodes:
            if node != self.ip:  # 'is not' compared identity, not equality
                address = (node, self.port)
                start_time = time()
                self.send_message(str_message=file_request_message,
                                  address=address,
                                  socket=self.client_socket)
                try:
                    response = self.get_message(self.client_socket)
                    nodes_responses.append({
                        "node": node,
                        "response": response,
                        "response_time": time() - start_time
                    })
                except timeout:
                    print(node + " node timed out")
        return nodes_responses

    def process_get_file_request(self, file_path):
        """Return the fastest node that holds `file_path`, or None."""
        nodes_responses = self.get_nodes_response_times(file_path)
        nodes_has_file = []
        for node_response in nodes_responses:
            # The original indexed the list with a string key; each entry of
            # nodes_responses is a dict.
            response_message = node_response["response"]["message"].split(',')
            node_has_file = response_message[0] == "True"
            if node_has_file:
                nodes_has_file.append({
                    "node": node_response["node"],
                    "node_port": response_message[1],
                    "response_time": node_response["response_time"]
                })
        # An empty list is never None, so the original check always passed
        # and min() could raise on an empty sequence.
        if nodes_has_file:
            return min(nodes_has_file, key=lambda t: t["response_time"])
        return None
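
# CheckFileService picks the fastest replica by timing one UDP round trip per
# node. A self-contained sketch of that probe-and-pick pattern with stdlib
# sockets; node addresses, port and payload below are illustrative, and
# unreachable nodes are simply skipped, as in get_nodes_response_times.
import socket
import time


def probe(nodes, port, payload, wait=1.0):
    """Send `payload` to each node and return (node, rtt, reply) tuples,
    skipping nodes that do not answer within `wait` seconds."""
    results = []
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    sock.settimeout(wait)
    for node in nodes:
        start = time.time()
        sock.sendto(payload, (node, port))
        try:
            reply, _ = sock.recvfrom(4096)
        except socket.timeout:
            continue  # node timed out; leave it out of the results
        results.append((node, time.time() - start, reply))
    sock.close()
    return results


# Picking the fastest node that answered "True,<port>":
# hits = [r for r in probe(['10.0.0.2', '10.0.0.3'], 3001, b'data/x.txt')
#         if r[2].split(b',')[0] == b'True']
# fastest = min(hits, key=lambda r: r[1]) if hits else None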
# Stdlib dependencies; config, misc, sd, concurrent, _mboxExecCmd, runTask,
# ThreadPool and the MAILBOX_*/MESSAGE_* constants come from the surrounding
# module.
import errno
import logging
import os
import struct
import threading
import time
import uuid


class SPM_MailMonitor:

    log = logging.getLogger('storage.MailBox.SpmMailMonitor')

    def registerMessageType(self, messageType, callback):
        self._messageTypes[messageType] = callback

    def unregisterMessageType(self, messageType):
        del self._messageTypes[messageType]

    def __init__(self, pool, maxHostID, monitorInterval=2):
        self._messageTypes = {}
        # Save arguments
        self._stop = False
        self._stopped = False
        self._poolID = str(pool.spUUID)
        self._spmStorageDir = pool.storage_repository
        tpSize = config.getint('irs', 'thread_pool_size') // 2
        waitTimeout = 3
        maxTasks = config.getint('irs', 'max_tasks')
        self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
        # *** IMPORTANT NOTE: The SPM's inbox is the HSMs' outbox and vice
        # versa ***
        self._inbox = os.path.join(self._spmStorageDir, self._poolID,
                                   "mastersd", sd.DOMAIN_META_DATA, "inbox")
        if not os.path.exists(self._inbox):
            self.log.error("SPM_MailMonitor create failed - inbox %s does "
                           "not exist" % repr(self._inbox))
            raise RuntimeError("SPM_MailMonitor create failed - inbox %s "
                               "does not exist" % repr(self._inbox))
        self._outbox = os.path.join(self._spmStorageDir, self._poolID,
                                    "mastersd", sd.DOMAIN_META_DATA, "outbox")
        if not os.path.exists(self._outbox):
            self.log.error("SPM_MailMonitor create failed - outbox %s does "
                           "not exist" % repr(self._outbox))
            raise RuntimeError("SPM_MailMonitor create failed - outbox %s "
                               "does not exist" % repr(self._outbox))
        self._numHosts = int(maxHostID)
        self._outMailLen = MAILBOX_SIZE * self._numHosts
        self._monitorInterval = monitorInterval
        # TODO: add support for multiple paths (multiple mailboxes)
        self._outgoingMail = self._outMailLen * "\0"
        self._incomingMail = self._outgoingMail
        self._inCmd = ['dd',
                       'if=' + str(self._inbox),
                       'iflag=direct,fullblock',
                       'count=1']
        self._outCmd = ['dd',
                        'of=' + str(self._outbox),
                        'oflag=direct',
                        'iflag=fullblock',
                        'conv=notrunc',
                        'count=1']
        self._outLock = threading.Lock()
        self._inLock = threading.Lock()
        # Clear outgoing mail
        self.log.debug("SPM_MailMonitor - clearing outgoing mail, command "
                       "is: %s", self._outCmd)
        cmd = self._outCmd + ['bs=' + str(self._outMailLen)]
        (rc, out, err) = _mboxExecCmd(cmd, data=self._outgoingMail)
        if rc:
            self.log.warning("SPM_MailMonitor couldn't clear outgoing mail, "
                             "dd failed")
        t = concurrent.thread(self.run, name="mailbox/spm",
                              logger=self.log.name)
        t.start()
        self.log.debug('SPM_MailMonitor created for pool %s' % self._poolID)

    def stop(self):
        self._stop = True

    def isStopped(self):
        return self._stopped

    def getMaxHostID(self):
        return self._numHosts

    def setMaxHostID(self, newMaxId):
        with self._inLock:
            with self._outLock:
                diff = newMaxId - self._numHosts
                if diff > 0:
                    delta = MAILBOX_SIZE * diff * "\0"
                    self._outgoingMail += delta
                    self._incomingMail += delta
                elif diff < 0:
                    # diff is negative, so negate it before slicing; the
                    # original sliced with a negative length and kept the
                    # wrong end of the buffer.
                    delta = MAILBOX_SIZE * -diff
                    self._outgoingMail = self._outgoingMail[:-delta]
                    self._incomingMail = self._incomingMail[:-delta]
                self._numHosts = newMaxId
                self._outMailLen = MAILBOX_SIZE * self._numHosts

    def _validateMailbox(self, mailbox, mailboxIndex):
        chkStart = MAILBOX_SIZE - CHECKSUM_BYTES
        chk = misc.checksum(mailbox[0:chkStart], CHECKSUM_BYTES)
        pChk = struct.pack('<l', chk)  # Assumes CHECKSUM_BYTES equals 4!!!
        if pChk != mailbox[chkStart:chkStart + CHECKSUM_BYTES]:
            self.log.error("SPM_MailMonitor: mailbox %s checksum failed, not "
                           "clearing mailbox, clearing newMail.",
                           str(mailboxIndex))
            return False
        elif pChk == pZeroChecksum:
            return False  # Ignore messages of an empty mailbox
        return True

    def _handleRequests(self, newMail):
        send = False
        # Run through all messages and check whether new messages have
        # arrived since the last read.
        for host in range(0, self._numHosts):
            # Check mailbox checksum
            mailboxStart = host * MAILBOX_SIZE
            isMailboxValidated = False
            for i in range(0, MESSAGES_PER_MAILBOX):
                msgId = host * SLOTS_PER_MAILBOX + i
                msgStart = msgId * MESSAGE_SIZE
                # The first byte of a message is its version; 0 means the
                # message is empty and can be skipped.
                if newMail[msgStart] in ['\0', '0']:
                    continue
                # Most mailboxes are probably empty, so it costs less to
                # check that all messages start with 0 than to validate the
                # mailbox; therefore validation happens only after we find a
                # non-empty message in the mailbox.
                if not isMailboxValidated:
                    if not self._validateMailbox(
                            newMail[mailboxStart:mailboxStart + MAILBOX_SIZE],
                            host):
                        # Clear the invalid mailbox in newMail
                        newMail = newMail[:mailboxStart] + EMPTYMAILBOX + \
                            newMail[mailboxStart + MAILBOX_SIZE:]
                        break
                    self.log.debug("SPM_MailMonitor: Mailbox %s validated, "
                                   "checking mail", host)
                    isMailboxValidated = True
                newMsg = newMail[msgStart:msgStart + MESSAGE_SIZE]
                msgOffset = msgId * MESSAGE_SIZE
                if newMsg == CLEAN_MESSAGE:
                    # Should probably put a setter on outgoingMail which
                    # would take the lock
                    self._outLock.acquire()
                    try:
                        self._outgoingMail = \
                            self._outgoingMail[0:msgOffset] + CLEAN_MESSAGE + \
                            self._outgoingMail[msgOffset + MESSAGE_SIZE:
                                               self._outMailLen]
                    finally:
                        self._outLock.release()
                    send = True
                    continue
                # Message isn't empty, check whether it's new
                isMessageNew = False
                for j in range(msgStart, msgStart + MESSAGE_SIZE):
                    if newMail[j] != self._incomingMail[j]:
                        isMessageNew = True
                        break
                # If the search was exhausted, i.e. the message hasn't
                # changed since the last read, it can be skipped.
                if not isMessageNew:
                    continue
                # We only get here if there is a novel request
                try:
                    msgType = newMail[msgStart + 1:msgStart + 5]
                    if msgType in self._messageTypes:
                        # Use the message class to process the request
                        # according to message-specific logic
                        task_id = str(uuid.uuid4())
                        self.log.debug("SPM_MailMonitor: processing request: "
                                       "%s" % repr(newMail[
                                           msgStart:msgStart + MESSAGE_SIZE]))
                        res = self.tp.queueTask(
                            task_id, runTask,
                            (self._messageTypes[msgType], msgId,
                             newMail[msgStart:msgStart + MESSAGE_SIZE]))
                        if not res:
                            raise Exception("could not queue task")
                    else:
                        self.log.error("SPM_MailMonitor: unknown message "
                                       "type encountered: %s", msgType)
                except RuntimeError as e:
                    self.log.error("SPM_MailMonitor: exception: %s caught "
                                   "while handling message: %s", str(e),
                                   newMail[msgStart:msgStart + MESSAGE_SIZE])
                except Exception:
                    self.log.error("SPM_MailMonitor: exception caught while "
                                   "handling message: %s",
                                   newMail[msgStart:msgStart + MESSAGE_SIZE],
                                   exc_info=True)
        self._incomingMail = newMail
        return send

    def _checkForMail(self):
        # The lock is acquired to make sure that neither _numHosts nor
        # incomingMail changes during checkForMail.
        self._inLock.acquire()
        try:
            cmd = self._inCmd + ['bs=' + str(self._outMailLen)]
            (rc, in_mail, err) = misc.execCmd(cmd, raw=True)
            if rc:
                raise IOError(errno.EIO, "_handleRequests._checkForMail - "
                              "Could not read mailbox: %s" % self._inbox)
            if len(in_mail) != self._outMailLen:
                self.log.error('SPM_MailMonitor: _checkForMail - dd '
                               'succeeded but read %d bytes instead of %d, '
                               'cannot check mail. Read mail contains: %s',
                               len(in_mail), self._outMailLen,
                               repr(in_mail[:80]))
                raise RuntimeError("_handleRequests._checkForMail - Could "
                                   "not read mailbox")
            if self._handleRequests(in_mail):
                self._outLock.acquire()
                try:
                    cmd = self._outCmd + ['bs=' + str(self._outMailLen)]
                    (rc, out, err) = _mboxExecCmd(cmd,
                                                  data=self._outgoingMail)
                    if rc:
                        self.log.warning("SPM_MailMonitor couldn't write "
                                         "outgoing mail, dd failed")
                finally:
                    self._outLock.release()
        finally:
            self._inLock.release()

    def sendReply(self, msgID, msg):
        # The lock is acquired to make sure that neither _numHosts nor
        # outgoingMail changes while they are used.
        self._outLock.acquire()
        try:
            msgOffset = msgID * MESSAGE_SIZE
            self._outgoingMail = \
                self._outgoingMail[0:msgOffset] + msg.payload + \
                self._outgoingMail[msgOffset + MESSAGE_SIZE:self._outMailLen]
            mailboxOffset = (msgID // SLOTS_PER_MAILBOX) * MAILBOX_SIZE
            mailbox = self._outgoingMail[mailboxOffset:
                                         mailboxOffset + MAILBOX_SIZE]
            cmd = self._outCmd + ['bs=' + str(MAILBOX_SIZE),
                                  'seek=' + str(mailboxOffset //
                                                MAILBOX_SIZE)]
            (rc, out, err) = _mboxExecCmd(cmd, data=mailbox)
            if rc:
                self.log.error("SPM_MailMonitor: sendReply - couldn't send "
                               "reply, dd failed")
        finally:
            self._outLock.release()

    def run(self):
        try:
            while not self._stop:
                try:
                    self._checkForMail()
                except Exception:
                    self.log.error("Error checking for mail", exc_info=True)
                time.sleep(self._monitorInterval)
        finally:
            self._stopped = True
            self.tp.joinAll(waitForTasks=False)
            self.log.info("SPM_MailMonitor - Incoming mail monitoring thread "
                          "stopped")
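
# _validateMailbox above and _sendMail in the HSM class below rely on the
# same trailer convention: a 4-byte little-endian checksum packed over the
# rest of the mailbox. A self-contained sketch of that convention; a simple
# byte-sum checksum is assumed here, and the project's misc.checksum may be
# implemented differently.
import struct

CHECKSUM_BYTES = 4


def checksum(data, nbytes):
    # Sum of all byte values, truncated to nbytes (an assumption; only the
    # trailer layout is taken from the code above).
    return sum(bytearray(data)) % (2 ** (nbytes * 8 - 1))


def seal(mailbox_body):
    """Append the packed checksum trailer to a mailbox body."""
    chk = checksum(mailbox_body, CHECKSUM_BYTES)
    return mailbox_body + struct.pack('<l', chk)  # '<l' is 4 bytes


def is_valid(mailbox):
    """Check the trailer the same way _validateMailbox does."""
    body = mailbox[:-CHECKSUM_BYTES]
    trailer = mailbox[-CHECKSUM_BYTES:]
    return struct.pack('<l', checksum(body, CHECKSUM_BYTES)) == trailer


box = seal(b'\0' * 60)
assert is_valid(box)
assert not is_valid(b'x' + box[1:])  # any flipped byte breaks the trailer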
# Stdlib dependencies; config, constants, misc, concurrent, _mboxExecCmd,
# runTask, ThreadPool and the mailbox constants come from the surrounding
# module.
import logging
import struct
import time
import uuid
import Queue


class HSM_MailMonitor(object):

    log = logging.getLogger('storage.MailBox.HsmMailMonitor')

    def __init__(self, inbox, outbox, hostID, queue, monitorInterval):
        # Save arguments
        tpSize = config.getint('irs', 'thread_pool_size') // 2
        waitTimeout = 3
        maxTasks = config.getint('irs', 'max_tasks')
        self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
        self._stop = False
        self._flush = False
        self._queue = queue
        self._activeMessages = {}
        self._monitorInterval = monitorInterval
        self._hostID = int(hostID)
        self._used_slots_array = [0] * MESSAGES_PER_MAILBOX
        self._outgoingMail = EMPTYMAILBOX
        self._incomingMail = EMPTYMAILBOX
        # TODO: add support for multiple paths (multiple mailboxes)
        self._spmStorageDir = config.get('irs', 'repository')
        self._inCmd = [constants.EXT_DD,
                       'if=' + str(inbox),
                       'iflag=direct,fullblock',
                       'bs=' + str(BLOCK_SIZE),
                       'count=' + str(BLOCKS_PER_MAILBOX),
                       'skip=' + str(self._hostID * BLOCKS_PER_MAILBOX)]
        self._outCmd = [constants.EXT_DD,
                        'of=' + str(outbox),
                        'iflag=fullblock',
                        'oflag=direct',
                        'conv=notrunc',
                        'bs=' + str(BLOCK_SIZE),
                        'seek=' + str(self._hostID * BLOCKS_PER_MAILBOX)]
        self._init = False
        self._initMailbox()  # Read initial mailbox state
        self._msgCounter = 0
        self._sendMail()  # Clear outgoing mailbox
        self._thread = concurrent.thread(self.run, name="mailbox/hsm",
                                         logger=self.log.name)
        self._thread.start()

    def _initMailbox(self):
        # Sync initial incoming mail state with the storage view
        (rc, out, err) = _mboxExecCmd(self._inCmd, raw=True)
        if rc == 0:
            self._incomingMail = out
            self._init = True
        else:
            self.log.warning("HSM_MailboxMonitor - Could not initialize "
                             "mailbox, will not accept requests until init "
                             "succeeds")

    def immStop(self):
        self._stop = True

    def immFlush(self):
        self._flush = True

    def _handleResponses(self, newMsgs):
        rc = False
        for i in range(0, MESSAGES_PER_MAILBOX):
            # Skip unused slots
            if self._used_slots_array[i] == 0:
                continue
            # Skip empty return messages; the first byte of a message is its
            # version, and version 0 means the message is empty.
            start = i * MESSAGE_SIZE
            if newMsgs[start] in ['\0', '0']:
                continue
            # Skip messages that haven't changed since the last read. The
            # original compared the loop index against the final offset,
            # which misclassified a message whose last byte was the only
            # change; for/else runs the else block only when no byte differed.
            for j in range(start, start + MESSAGE_SIZE):
                if newMsgs[j] != self._incomingMail[j]:
                    break
            else:
                continue
            # We only get here if there is a novel reply, so we can remove
            # the message from the active list and the outgoing mail and
            # handle the reply.
            rc = True
            newMsg = newMsgs[start:start + MESSAGE_SIZE]
            if newMsg == CLEAN_MESSAGE:
                del self._activeMessages[i]
                self._used_slots_array[i] = 0
                self._msgCounter -= 1
                self._outgoingMail = self._outgoingMail[0:start] + \
                    MESSAGE_SIZE * "\0" + \
                    self._outgoingMail[start + MESSAGE_SIZE:]
                continue
            msg = self._activeMessages[i]
            self._activeMessages[i] = CLEAN_MESSAGE
            self._outgoingMail = self._outgoingMail[0:start] + \
                CLEAN_MESSAGE + self._outgoingMail[start + MESSAGE_SIZE:]
            try:
                self.log.debug("HSM_MailboxMonitor(%s/%s) - Checking reply: "
                               "%s", self._msgCounter, MESSAGES_PER_MAILBOX,
                               repr(newMsg))
                msg.checkReply(newMsg)
                if msg.callback:
                    try:
                        task_id = str(uuid.uuid4())
                        if not self.tp.queueTask(task_id, runTask,
                                                 (msg.callback,
                                                  msg.volumeData)):
                            raise Exception("could not queue task")
                    except Exception:
                        self.log.error("HSM_MailMonitor: exception caught "
                                       "while running msg callback, for "
                                       "message: %s, callback function: %s",
                                       repr(msg.payload), msg.callback,
                                       exc_info=True)
            except RuntimeError as e:
                self.log.error("HSM_MailMonitor: exception: %s caught while "
                               "checking reply for message: %s, reply: %s",
                               str(e), repr(msg.payload), repr(newMsg))
            except Exception:
                self.log.error("HSM_MailMonitor: exception caught while "
                               "checking reply from SPM, request was: %s "
                               "reply: %s", repr(msg.payload), repr(newMsg),
                               exc_info=True)
        # Finished processing incoming mail; keep it to compare against the
        # next batch.
        self._incomingMail = newMsgs
        return rc

    def _checkForMail(self):
        (rc, in_mail, err) = misc.execCmd(self._inCmd, raw=True)
        if rc:
            raise RuntimeError("_handleResponses.Could not read mailbox - "
                               "rc %s" % rc)
        if len(in_mail) != MAILBOX_SIZE:
            raise RuntimeError("_handleResponses.Could not read mailbox - "
                               "len %s != %s" % (len(in_mail), MAILBOX_SIZE))
        return self._handleResponses(in_mail)

    def _sendMail(self):
        self.log.info("HSM_MailMonitor sending mail to SPM - " +
                      str(self._outCmd))
        chk = misc.checksum(
            self._outgoingMail[0:MAILBOX_SIZE - CHECKSUM_BYTES],
            CHECKSUM_BYTES)
        pChk = struct.pack('<l', chk)  # Assumes CHECKSUM_BYTES equals 4!!!
        self._outgoingMail = \
            self._outgoingMail[0:MAILBOX_SIZE - CHECKSUM_BYTES] + pChk
        _mboxExecCmd(self._outCmd, data=self._outgoingMail)

    def _handleMessage(self, message):
        # TODO: add support for multiple mailboxes
        # Use None as the free-slot sentinel: slot 0 is falsy, so the
        # original `freeSlot = False` bookkeeping mishandled the first slot.
        freeSlot = None
        for i in range(0, MESSAGES_PER_MAILBOX):
            if self._used_slots_array[i] == 0:
                if freeSlot is None:
                    freeSlot = i
                continue
            duplicate = True
            for j in range(0, MESSAGE_SIZE):
                if message[j] != self._activeMessages[i][j]:
                    duplicate = False
                    break
            if duplicate:
                self.log.debug("HSM_MailMonitor - ignoring duplicate message "
                               "%s" % (repr(message)))
                return
        if freeSlot is None:
            raise RuntimeError("HSM_MailMonitor - Active messages list full, "
                               "cannot add new message")
        self._msgCounter += 1
        self._used_slots_array[freeSlot] = 1
        self._activeMessages[freeSlot] = message
        start = freeSlot * MESSAGE_SIZE
        end = start + MESSAGE_SIZE
        self._outgoingMail = self._outgoingMail[0:start] + message.payload + \
            self._outgoingMail[end:]
        self.log.debug("HSM_MailMonitor - start: %s, end: %s, len: %s, "
                       "message(%s/%s): %s" %
                       (start, end, len(self._outgoingMail),
                        self._msgCounter, MESSAGES_PER_MAILBOX,
                        repr(self._outgoingMail[start:end])))

    def run(self):
        try:
            failures = 0
            # Do not start processing requests before the incoming mailbox
            # is initialized.
            while not self._init and not self._stop:
                try:
                    time.sleep(2)
                    self._initMailbox()  # Read initial mailbox state
                except Exception:
                    pass
            while not self._stop:
                try:
                    message = None
                    sendMail = False
                    # If no message is pending, block until a new message or
                    # a stop command arrives.
                    while not self._stop and not message and \
                            not self._activeMessages:
                        try:
                            # Check if a new message is waiting to be sent
                            message = self._queue.get(
                                block=True, timeout=self._monitorInterval)
                            self._handleMessage(message)
                            message = None
                            sendMail = True
                        except Queue.Empty:
                            pass
                    if self._stop:
                        break
                    # If pending messages are available, check whether new
                    # messages are waiting in the queue as well.
                    empty = False
                    while (not empty) and \
                            (len(self._activeMessages) <
                             MESSAGES_PER_MAILBOX):
                        # TODO: Remove single mailbox limitation
                        try:
                            message = self._queue.get(block=False)
                            self._handleMessage(message)
                            message = None
                            sendMail = True
                        except Queue.Empty:
                            empty = True
                    if self._flush:
                        self._flush = False
                        sendMail = True
                    try:
                        sendMail |= self._checkForMail()
                        failures = 0
                    except Exception:
                        self.log.error("HSM_MailboxMonitor - Exception "
                                       "caught while checking for mail",
                                       exc_info=True)
                        failures += 1
                    if sendMail:
                        self._sendMail()
                    # If there are active messages waiting for an SPM reply,
                    # wait a few seconds before performing another IO op.
                    if self._activeMessages and not self._stop:
                        # After repeated failures, back off for one minute
                        # before retrying.
                        if failures > 9:
                            time.sleep(60)
                        else:
                            time.sleep(self._monitorInterval)
                except Exception:
                    self.log.error("HSM_MailboxMonitor - Incoming mail "
                                   "monitoring thread caught exception; "
                                   "will try to recover", exc_info=True)
        finally:
            self.log.info("HSM_MailboxMonitor - Incoming mail monitoring "
                          "thread stopped, clearing outgoing mail")
            self._outgoingMail = EMPTYMAILBOX
            self._sendMail()  # Clear outgoing mailbox
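
# _handleMessage above is a fixed-size slot table: find the first free slot,
# reject duplicates, fail when the table is full. A minimal sketch of that
# pattern; the slot count and message strings are illustrative. Note the
# sentinel must not be a valid slot index, since slot 0 is falsy.
SLOTS = 8  # illustrative table size


class SlotTable(object):

    def __init__(self):
        self.used = [0] * SLOTS
        self.messages = {}

    def add(self, message):
        """Store message in the first free slot; return its index."""
        free = None  # None, not False/0: slot 0 is a valid index
        for i in range(SLOTS):
            if self.used[i] == 0:
                if free is None:
                    free = i
                continue
            if self.messages[i] == message:
                return i  # duplicate: already being processed
        if free is None:
            raise RuntimeError("table full")
        self.used[free] = 1
        self.messages[free] = message
        return free

    def remove(self, i):
        self.used[i] = 0
        del self.messages[i]


table = SlotTable()
assert table.add("extend-volume-1") == 0
assert table.add("extend-volume-1") == 0  # duplicate maps to the same slot
assert table.add("extend-volume-2") == 1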
# Stdlib dependencies; ThreadPool, Database, WebPage, Group, REGroup and log
# come from the surrounding project.
import codecs
import time
from collections import deque
from threading import Lock
from urlparse import urlparse, urljoin

from bs4 import BeautifulSoup


class Crawler(object):

    def __init__(self, args, startURLs):
        self.depth = args.depth          # maximum crawl depth
        self.currentDepth = 1            # crawl depth starts at 1
        self.database = Database(args.dbFile)  # result database
        # Store group ids to file, using UTF-8
        self.groupfile = codecs.open("GroupID.txt", "w", "UTF-8")
        self.threadPool = ThreadPool(args.threadNum)  # thread pool
        self.visitedGroups = set()       # group ids already visited
        self.unvisitedGroups = deque()   # group ids still to visit
        self.groupInfo = []              # all collected group info
        self.lock = Lock()
        self.isCrawling = False          # whether the crawler is running
        # Seed with the start group pages
        for url in startURLs:
            match_obj = REGroup.match(url)
            print "Add start urls:", url
            assert match_obj is not None
            self.unvisitedGroups.append(match_obj.group(1))
        # Maximum number of visits allowed per minute
        self.MAX_VISITS_PER_MINUTE = 10
        # Number of pages visited in the current period
        self.currentPeriodVisits = 0
        # Treat one minute as one visiting period; record its start time
        self.periodStart = time.time()

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            self.periodStart = time.time()  # current period starts now
            # Crawl one depth at a time
            while self.currentDepth < self.depth + 1:
                # Dispatch all pages of the current depth (non-blocking)
                self._assignCurrentDepthTasks()
                # Wait until the pool drains; one drained queue equals one
                # finished depth. taskJoin() would also work, but it cannot
                # be interrupted with Ctrl-C.
                while self.threadPool.getTaskLeft() > 0:
                    print "Task left: ", self.threadPool.getTaskLeft()
                    time.sleep(3)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedGroups))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedGroups)))
                self.currentDepth += 1
            self.stop()
            assert self.threadPool.getTaskLeft() == 0
            print "Main Crawling procedure finished!"

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        # Save group ids to file
        for group_id in self.visitedGroups:
            self.groupfile.write(group_id + "\n")
        self.groupfile.close()
        self.database.close()

    def getAlreadyVisitedNum(self):
        # visitedGroups holds the ids already handed to the task queue, some
        # of which may still be in flight, so the true number of visited
        # links is visitedGroups minus the tasks still pending.
        if len(self.visitedGroups) == 0:
            return 0
        return len(self.visitedGroups) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        """Assign fetch tasks to the pool, enforcing the visit rate limit."""
        # If this period's quota is exhausted, wait for the period to end
        if self.currentPeriodVisits > self.MAX_VISITS_PER_MINUTE - 1:
            # Wait for all in-flight pages to finish first
            while self.threadPool.getTaskLeft() > 0:
                print "Waiting period ends..."
                time.sleep(1)
            seconds = time.time() - self.periodStart
            if seconds < 60:
                # Sleep away the remainder of the minute (the original slept
                # for the time already elapsed instead).
                time.sleep(int(60 - seconds) + 3)
            self.periodStart = time.time()  # reset the period start
            self.currentPeriodVisits = 0
        # Pull group ids off the unvisited list and hand them to the pool
        while len(self.unvisitedGroups) > 0:
            group_id = self.unvisitedGroups.popleft()
            url = "http://www.douban.com/group/" + group_id + "/"
            self.threadPool.putTask(self._taskHandler, url)
            # Record the group id as visited
            self.visitedGroups.add(group_id)

    def _taskHandler(self, url):
        """Fetch the page at the given url."""
        print "Visiting : " + url
        webPage = WebPage(url)
        flag = webPage.fetch()
        if flag:
            self.lock.acquire()  # guard the visit counter update
            self.currentPeriodVisits += 1
            self.lock.release()
            self._saveTaskResults(webPage)
            self._addUnvisitedGroups(webPage)
            return True
        return False  # page read failed

    def _saveTaskResults(self, webPage):
        """Write the group's info to the database."""
        url, pageSource = webPage.getDatas()
        dbgroup = Group(url, pageSource)
        self.database.saveGroupInfo(dbgroup)

    def _addUnvisitedGroups(self, webPage):
        """Collect unvisited group links, filtering out everything that is
        not a group homepage; valid ids go into unvisitedGroups."""
        # Filter links: 1. http/https pages only; 2. visit each link once
        url, pageSource = webPage.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        for href in hrefs:
            match_obj = REGroup.match(href)
            # Only links matching the group homepage pattern are handled
            if self._isHttpOrHttpsProtocol(href) and match_obj is not None:
                group_id = match_obj.group(1)
                if not self._isGroupRepeated(group_id):
                    # Queue the group id for visiting
                    print "Add group id:", group_id
                    self.unvisitedGroups.append(group_id)

    def _getAllHrefsFromPage(self, url, pageSource):
        """Parse the html source and return every link on the page."""
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href=True)
        for a in results:
            # Links must be encoded as utf8: bs4 does not url-encode CJK
            # file links such as http://aa.com/文件.pdf, which would
            # otherwise raise an encoding exception downstream.
            href = a.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)  # resolve relative links
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        protocol = urlparse(href).scheme
        return protocol == 'http' or protocol == 'https'

    def _isGroupRepeated(self, group_id):
        return (group_id in self.visitedGroups) or \
               (group_id in self.unvisitedGroups)

    def _isDatabaseAvaliable(self):
        return self.database.isConn()

    def selfTesting(self, args):
        url = 'http://www.douban.com/group/insidestory/'
        print '\nVisiting http://www.douban.com/group/insidestory/'
        # Check that the network is up and the page can be fetched
        pageSource = WebPage(url).fetch()
        if pageSource is None:
            print 'Please check your network and make sure it\'s connected.\n'
        # Database check
        elif not self._isDatabaseAvaliable():
            print 'Please make sure you have the permission to save data: %s\n' % args.dbFile
        else:
            print 'Create logfile and database Successfully.'
            print 'Already saved the test page, please check the database record.'
            print 'Seems No Problem!\n'
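
# _assignCurrentDepthTasks enforces a fixed visits-per-minute budget by
# counting requests in a one-minute window and sleeping out the remainder.
# The same idea as a small reusable helper; the class name and limit below
# are illustrative, not part of the project.
import time


class MinuteRateLimiter(object):
    """Allow at most `limit` acquisitions per 60-second window."""

    def __init__(self, limit=10):
        self.limit = limit
        self.visits = 0
        self.period_start = time.time()

    def acquire(self):
        if self.visits >= self.limit:
            elapsed = time.time() - self.period_start
            if elapsed < 60:
                time.sleep(60 - elapsed)  # sleep out the rest of the minute
            self.period_start = time.time()  # start a fresh window
            self.visits = 0
        self.visits += 1


limiter = MinuteRateLimiter(limit=10)
for url_index in range(25):  # the 11th and 21st calls block
    limiter.acquire()
    # fetch a page here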