def Principal():
    # Pre-allocate a pool of 10 threads.
    thread = ThreadPool(10)
    while True:
        cnx, end = s.accept()
        print "the following address connected: " + end[0]
        clientes.append(cnx)
        thread.insert_job(novo_cliente, cnx, end)
def main():
    try:
        f = open(r'ip.txt', 'rb')
        ip = ''
        for line in f.readlines():
            final_ip = line.strip('\n')
            for i in get_ip_list(final_ip):
                print i
                ip += str(i).strip() + '\n'
        with open(r'scan_ip.txt', 'w') as ff:
            ff.write(ip)
        data = []
        items = portscan()  # Run masscan over the ports
        dataList = {}
        for i in items:
            i = i.split('|')
            if i[1] not in dataList:
                dataList[str(i[1])] = []
            dataList[str(i[1])].append(i[0])
        for i in dataList:
            if len(dataList[i]) >= 50:
                # Drop hosts that report 50 or more open ports
                for port in dataList[i]:
                    items.remove(str(port) + '|' + str(i))
        pool = ThreadPool(20, 1000)
        pool.start(NmapScan, items, data)
    except Exception as e:
        print e
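# The get_ip_list() helper called by main() above is defined elsewhere.
# A minimal sketch under the assumption that each line of ip.txt is a
# single address or a CIDR block, expanded here with the IPy library
# (the dependency and exact semantics are assumptions, not confirmed by
# the original source):
from IPy import IP

def get_ip_list(ip_segment):
    # IP() accepts both single addresses and CIDR notation, e.g.
    # '1.2.3.4' or '10.0.0.0/24', and iterating yields every address.
    return [str(ip) for ip in IP(ip_segment)]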
def __init__(self, start_url, thread_num, post_list_path, max_post_num=1000):
    """
    `group_id`        the group id to crawl
    `thread_num`      number of crawler threads
    `post_list_path`  file path for saving the complete post id list
    """
    # Thread pool with the given number of threads
    self.thread_pool = ThreadPool(thread_num)
    # Thread for saving topics
    # NOTE: only one saver thread is allowed here, since they all operate
    # on the same file
    self.save_thread = ThreadPool(1)
    # Group-related information
    self.post_list_path = post_list_path
    # Pages already visited: Group id ==> True or False
    self.visited_href = set()
    # Group discussion pages waiting to be visited
    self.unvisited_href = deque()
    # Links that failed to load
    self.failed_href = set()
    self.start_url = start_url
    # Crawling ends in one of two cases: 1) the number of topics crawled
    # has reached the maximum; 2) all topics have been crawled
    # Only thread ids are saved
    self.post_list = list()
    self.is_crawling = False
    # Maximum number of topics to crawl per group
    self.MAX_POST_NUM = max_post_num
def __init__(self, args=Strategy()):
    self.url = args.url
    self.max_depth = args.max_depth      # Maximum page depth
    self.max_count = args.max_count      # Maximum number of pages to crawl
    self.concurrency = args.concurrency  # Number of threads
    self.timeout = args.timeout          # Timeout
    self.cookies = args.cookies          # Cookies
    self.ssl_verify = args.ssl_verify    # SSL verification
    self.same_host = args.same_host      # Only crawl links on the same host
    self.same_domain = args.same_domain  # Only crawl links in the same domain
    self.currentDepth = 1                # Initial crawl depth, starting from 1
    self.keyword = args.keyword          # Keyword, decoded with the console's default encoding
    self.threadPool = ThreadPool(args.concurrency)  # Thread pool with the given number of threads
    self.visitedHrefs = set()            # Links already visited
    self.unvisitedHrefs = deque()        # Links waiting to be visited
    self.unvisitedHrefs.append(args.url) # Seed with the first link to visit
    self.isCrawling = False              # Flag marking whether the crawler is running
    self.file = BASEDIR + '/cache/crawler/' + genFilename(self.url) + '.txt'
    # FIXME: the following line is problematic
    self.database = Database(args.dbFile)  # Database
    self.lock = Lock()
class FileService():
    def __init__(self, ip, port, node_list, directory, timeout):
        self.check_file_service = CheckFileService(
            ip=ip,
            port=3001,
            nodes=node_list,
            timeout=timeout,
            directory=directory,
        )
        self.file_transfer_service = FileTransferService(ip=ip,
                                                         port=3002,
                                                         directory=directory)
        self.directory = directory
        self.threadpool = ThreadPool(2)

    def start_service(self):
        self.threadpool.add_task(self.check_file_service.start_server, ('', 0))
        self.threadpool.wait_completion()

    def get_file(self, file_path):
        # Find the fastest node holding the file, fetch it over TCP, and
        # save it locally.
        fastest_node = self.check_file_service.process_get_file_request(
            file_path=file_path)
        requested_file = self.file_transfer_service.get_file_from_node(
            node_ip=fastest_node["node"],
            node_port=fastest_node["node_port"],
            file_path=file_path)
        self.save_file(file_path, requested_file)

    def save_file(self, file_path, file_to_be_saved):
        # Open for writing so the fetched content is actually persisted.
        with open(file_path, 'w') as f:
            f.write(file_to_be_saved)
def saveProxies(self):
    # Create 30 threads and start them
    threadPool = ThreadPool(30)
    threadPool.startThreads()
    # Read the proxy records from the database
    proxyip = self.proxyip_db.readData()
    # Queue a check task for every proxy
    for proxy in proxyip:
        threadPool.putTask(self.checkclientUrl, proxy[0])
    # Collect the results in a loop: on success write to the database,
    # on failure set available to 0 or delete the record
    ip_fail = 0
    ip_ok = 0
    ip_lock = 0
    while threadPool.getTaskLeft():
        flag, proxy = threadPool.getTaskResult()
        print flag, proxy
        if flag == 'ok':
            self.proxyip_db.updateData(1, proxy)
            ip_ok = ip_ok + 1
        elif flag == 'lock':
            self.proxyip_db.updateData(0, proxy)
            ip_lock = ip_lock + 1
        else:
            self.proxyip_db.delData(proxy)
            ip_fail = ip_fail + 1
    print '====> available ip: ', ip_ok, ' , lock ip: ', ip_lock, ' , fail ip: ', ip_fail, ' <===='
    threadPool.stopThreads()
def __init__(self, pool, maxHostID, monitorInterval=2):
    self._messageTypes = {}
    # Save arguments
    self._stop = False
    self._stopped = False
    self._poolID = str(pool.spUUID)
    self._spmStorageDir = pool.storage_repository
    tpSize = config.getint('irs', 'thread_pool_size') / 2
    waitTimeout = 3
    maxTasks = config.getint('irs', 'max_tasks')
    self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
    # *** IMPORTANT NOTE: The SPM's inbox is the HSMs' outbox and vice
    # versa ***
    self._inbox = os.path.join(self._spmStorageDir, self._poolID,
                               "mastersd", sd.DOMAIN_META_DATA, "inbox")
    if not os.path.exists(self._inbox):
        self.log.error("SPM_MailMonitor create failed - inbox %s does not "
                       "exist" % repr(self._inbox))
        raise RuntimeError("SPM_MailMonitor create failed - inbox %s does "
                           "not exist" % repr(self._inbox))
    self._outbox = os.path.join(self._spmStorageDir, self._poolID,
                                "mastersd", sd.DOMAIN_META_DATA, "outbox")
    if not os.path.exists(self._outbox):
        self.log.error("SPM_MailMonitor create failed - outbox %s does "
                       "not exist" % repr(self._outbox))
        raise RuntimeError("SPM_MailMonitor create failed - outbox %s "
                           "does not exist" % repr(self._outbox))
    self._numHosts = int(maxHostID)
    self._outMailLen = MAILBOX_SIZE * self._numHosts
    self._monitorInterval = monitorInterval
    # TODO: add support for multiple paths (multiple mailboxes)
    self._outgoingMail = self._outMailLen * "\0"
    self._incomingMail = self._outgoingMail
    self._inCmd = ['dd',
                   'if=' + str(self._inbox),
                   'iflag=direct,fullblock',
                   'count=1'
                   ]
    self._outCmd = ['dd',
                    'of=' + str(self._outbox),
                    'oflag=direct',
                    'iflag=fullblock',
                    'conv=notrunc',
                    'count=1'
                    ]
    self._outLock = threading.Lock()
    self._inLock = threading.Lock()
    # Clear outgoing mail
    self.log.debug("SPM_MailMonitor - clearing outgoing mail, command is: "
                   "%s", self._outCmd)
    cmd = self._outCmd + ['bs=' + str(self._outMailLen)]
    (rc, out, err) = _mboxExecCmd(cmd, data=self._outgoingMail)
    if rc:
        self.log.warning("SPM_MailMonitor couldn't clear outgoing mail, "
                         "dd failed")
    t = concurrent.thread(self.run, name="mailbox.SPMMonitor",
                          logger=self.log.name)
    t.start()
    self.log.debug('SPM_MailMonitor created for pool %s' % self._poolID)
def __init__(self, ip, port, initial_nodes, period):
    super(DiscoveryService, self).__init__(name='Discovery', ip=ip,
                                           port=port)
    self.period = period
    self.nodes = initial_nodes
    self.threadpool = ThreadPool(2)
def __init__(self, inbox, outbox, hostID, queue, monitorInterval):
    # Save arguments
    tpSize = config.getint('irs', 'thread_pool_size') / 2
    waitTimeout = 3
    maxTasks = config.getint('irs', 'max_tasks')
    self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
    self._stop = False
    self._flush = False
    self._queue = queue
    self._activeMessages = {}
    self._monitorInterval = monitorInterval
    self._hostID = int(hostID)
    self._used_slots_array = [0] * MESSAGES_PER_MAILBOX
    self._outgoingMail = EMPTYMAILBOX
    self._incomingMail = EMPTYMAILBOX
    # TODO: add support for multiple paths (multiple mailboxes)
    self._spmStorageDir = config.get('irs', 'repository')
    self._inCmd = [constants.EXT_DD,
                   'if=' + str(inbox),
                   'iflag=direct,fullblock',
                   'bs=' + str(BLOCK_SIZE),
                   'count=' + str(BLOCKS_PER_MAILBOX),
                   'skip=' + str(self._hostID * BLOCKS_PER_MAILBOX)
                   ]
    self._outCmd = [constants.EXT_DD,
                    'of=' + str(outbox),
                    'iflag=fullblock',
                    'oflag=direct',
                    'conv=notrunc',
                    'bs=' + str(BLOCK_SIZE),
                    'seek=' + str(self._hostID * BLOCKS_PER_MAILBOX)
                    ]
    self._init = False
    self._initMailbox()  # Read initial mailbox state
    self._msgCounter = 0
    self._sendMail()  # Clear outgoing mailbox
    self._thread = concurrent.thread(self.run, name="mailbox/hsm",
                                     logger=self.log.name)
    self._thread.start()
def __init__(self,
             tpSize=config.getfloat('irs', 'thread_pool_size'),
             waitTimeout=3,
             maxTasks=config.getfloat('irs', 'max_tasks')):
    self.storage_repository = config.get('irs', 'repository')
    self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
    self._tasks = {}
    self._unqueuedTasks = []
def __init__(self, args):
    self.depth = args.depth
    self.currentDepth = 1
    self.database = database(args.dbFile)
    self.threadPool = ThreadPool(args.threadNum)
    self.visitUrls = set()
    self.unvisitedUrls = deque()
    self.unvisitedUrls.append(args.url)
    self.isCrawling = False
    self.maxWebPages = args.maxWebPages
def __init__(self, args):
    self.depth = args.depth
    self.currentDepth = 1
    self.keyword = args.keyword.decode(getdefaultlocale()[1])
    self.database = Database(args.dbFile)
    self.threadPool = ThreadPool(args.threadNum)
    self.visitedHrefs = set()
    self.unvisitedHrefs = deque()
    self.unvisitedHrefs.append(args.url)
    self.isCrawling = False
def __init__(self, ip, port, nodes, timeout, directory,
             file_transfer_service):
    super(CheckFileService, self).__init__(name='CheckFile', ip=ip,
                                           port=port)
    self.nodes = nodes
    self.directory = directory
    self.client_socket.settimeout(timeout)
    self.threadpool = ThreadPool(2)
    self.file_transfer_service = file_transfer_service
def clientThreadMain():
    # Pre-allocate a pool of 20 threads
    thread = ThreadPool(20)
    # Server main loop
    while True:
        conexao, endereco = server.accept()
        print endereco[0] + " connected!"
        # When a client connects, it is added to the client list
        # (used for broadcasting)
        clientes.append(conexao)
        thread.insert_job(newClient, conexao, endereco)
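# The newClient job queued above is not shown in this snippet. A purely
# hypothetical sketch, assuming each client is served by a simple
# recv/broadcast loop over the shared `clientes` list:
def newClient(conexao, endereco):
    while True:
        dados = conexao.recv(1024)
        if not dados:  # empty read means the client disconnected
            break
        # Relay the message to every other connected client.
        for cliente in clientes:
            if cliente is not conexao:
                cliente.send(dados)
    clientes.remove(conexao)
    conexao.close()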
def __init__(self, ip, port, node_list, directory, timeout):
    self.check_file_service = CheckFileService(
        ip=ip,
        port=3001,
        nodes=node_list,
        timeout=timeout,
        directory=directory,
    )
    self.file_transfer_service = FileTransferService(ip=ip,
                                                     port=3002,
                                                     directory=directory)
    self.directory = directory
    self.threadpool = ThreadPool(2)
def __init__(self, groupID, topicIDList, threadNum, topic_info_path,
             comment_info_path):
    """
    `groupID`           the current group id
    `topicIDList`       list of topic ids to crawl
    `threadNum`         number of threads to start
    `topic_info_path`   file for storing topic info
    `comment_info_path` file for storing comment info
    """
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(threadNum)
    # Thread for writing the database
    # self.DBThread = ThreadPool(1)
    # Ensure only one thread writes the file at a time
    self.saveThread = ThreadPool(1)
    self.database = Database("DoubanGroup.db")
    # self.database = Database("test.db")
    self.topic_info_path = topic_info_path
    self.comment_info_path = comment_info_path
    # Pages already visited: Group id ==> True or False
    self.visitedHref = set()
    # Topic ids that failed to be crawled
    self.failed = set()
    # Extract topic comments for each group in turn
    self.groupID = groupID
    self.topicIDList = topicIDList  # Topics waiting to be crawled
    # Results storage: topic ID ==> Topic object
    self.topicDict = dict()
    # Next comment page to process: topic ID ==> 1, 2, 3...
    self.nextPage = dict()
    # Topic ids whose crawl has finished
    self.finished = set()
    self.isCrawling = False
    # Maximum number of comments to crawl per topic
    # self.MAX_COMMETS_NUM = 5000
    self.MAX_COMMETS_NUM = float('inf')
    # Number of comments per page
    self.COMMENTS_PER_PAGE = 100
class TaskManager:
    log = logging.getLogger('TaskManager')

    def __init__(self,
                 tpSize=config.getfloat('irs', 'thread_pool_size'),
                 waitTimeout=3,
                 maxTasks=config.getfloat('irs', 'max_tasks')):
        self.storage_repository = config.get('irs', 'repository')
        self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
        self._tasks = {}
        self._unqueuedTasks = []

    def queue(self, task):
        return self._queueTask(task, task.commit)

    def queueRecovery(self, task):
        return self._queueTask(task, task.recover)

    def _queueTask(self, task, method):
        try:
            self.log.debug("queueing task: %s", task.id)
            self._tasks[task.id] = task
            if not self.tp.queueTask(task.id, method):
                self.log.error("unable to queue task: %s", task.dumpTask())
                del self._tasks[task.id]
                raise se.AddTaskError()
            self.log.debug("task queued: %s", task.id)
        except Exception, ex:
            self.log.error("Could not queue task, encountered: %s", str(ex))
            raise
        return task.id
def __init__(self, args=Strategy()):
    self.url = args.url
    self.max_depth = args.max_depth      # Maximum page depth
    self.max_count = args.max_count      # Maximum number of pages to crawl
    self.concurrency = args.concurrency  # Number of threads
    self.timeout = args.timeout          # Timeout
    self.cookies = args.cookies          # Cookies
    self.ssl_verify = args.ssl_verify    # SSL verification
    self.same_host = args.same_host      # Only crawl links on the same host
    self.same_domain = args.same_domain  # Only crawl links in the same domain
    self.currentDepth = 1                # Initial crawl depth, starting from 1
    self.keyword = args.keyword          # Keyword, decoded with the console's default encoding
    self.threadPool = ThreadPool(args.concurrency)  # Thread pool with the given number of threads
    self.visitedHrefs = set()            # Links already visited
    self.unvisitedHrefs = deque()        # Links waiting to be visited
    self.unvisitedHrefs.append(args.url) # Seed with the first link to visit
    self.isCrawling = False              # Flag marking whether the crawler is running
    self.file = BASEDIR + '/cache/crawler/' + genFilename(self.url) + '.txt'
    print self.file
    print 'args.url=\t', args.url
    # FIXME: the following line is problematic
    self.database = Database(args.dbFile)  # Database
    self.lock = Lock()
def __init__(self, url, depth, threadNum, dbfile, key):
    # Queue of urls to fetch
    self.urlQueue = Queue()
    # Queue of fetched html
    self.htmlQueue = Queue()
    # Urls already visited
    self.readUrls = []
    # Links not yet visited
    self.links = []
    # Number of threads
    self.threadNum = threadNum
    # Database file name
    self.dbfile = dbfile
    # Create the storage database object
    self.dataBase = SaveDataBase(self.dbfile)
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(self.threadNum)
    # Initialize the url queue
    self.urlQueue.put(url)
    # Keyword, decoded with the console's default encoding
    self.key = key.decode(getdefaultlocale()[1])
    # Crawl depth
    self.depth = depth
    # Current crawl depth
    self.currentDepth = 1
    # Current program state
    self.state = False
def __init__(self, args, queue):
    threading.Thread.__init__(self)
    # Maximum page depth
    self.depth = args['depth']
    # Initial crawl depth, starting from 1
    self.currentDepth = 1
    # Keyword, decoded with the console's default encoding
    self.keyword = args['keyword'].decode(getdefaultlocale()[1])
    # Database
    self.database = Database(db="bt_tornado")
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(args['threadNum'])
    # Links already visited
    self.visitedHrefs = set()
    # Links waiting to be visited
    self.unvisitedHrefs = deque()
    # Seed the queue with the start URLs
    for url in args['url']:
        self.unvisitedHrefs.append(url)
    # Flag marking whether the crawler is running
    self.isCrawling = False
    # Allow or deny crawl url
    self.entryFilter = args['entryFilter']
    # Allow to output back url
    self.yieldFilter = args['yieldFilter']
    # self.callbackFilter = args['callbackFilter']
    # self.db = args['db']
    self.collection = args['collection']
    # Communication queue
    self.queue = queue
def __init__(self, args):
    # Maximum crawl depth
    self.max_deepth = args['deepth']
    # Current depth
    self.current_deepth = 1
    # Thread management
    self.threadPool = ThreadPool(args['threads'])
    # Database file to use
    self.dbfile = args['dbfile']
    # Keyword
    self.keyword = args['keyword']
    # Whether to self-test
    self.testself = args['testself']
    # Links to visit at the current level; a set for deduplication
    self.unvisitedUrl = set()
    self.unvisitedUrl.add(args['url'])
    # Links already visited
    self.visitedUrl = set()
    self.q = Queue()
    # HTTP headers
    self.header = {
        'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,*/*;q=0.8',
        'Accept-Encoding': 'gzip,deflate,sdch',
        'Connection': 'keep-alive',
        'User-Agent': 'Mozilla/5.0 (Windows NT 6.1; WOW64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/29.0.1547.76 Safari/537.36'
    }
    # Connect to the database
    self.connDB()
    self.isRunning = True
def __init__(self,
             tpSize=config.getint('irs', 'thread_pool_size'),
             waitTimeout=3,
             maxTasks=config.getint('irs', 'max_tasks')):
    self.storage_repository = config.get('irs', 'repository')
    self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
    self._tasks = {}
    self._unqueuedTasks = []
def __init__(self, group_id, topic_id_list, thread_num, base_path,
             topic_info_path, comment_info_path):
    """
    `group_id`          the current group id
    `topic_id_list`     list of topic ids to crawl
    `thread_num`        number of threads to start
    `topic_info_path`   file for storing topic info
    `comment_info_path` file for storing comment info
    """
    # Thread pool with the given number of threads
    self.thread_pool = ThreadPool(thread_num)
    # Each topic is saved to its own file, so saving can run concurrently
    self.save_thread = ThreadPool(10)
    self.topic_info_path = topic_info_path
    self.comment_info_path = comment_info_path
    self.base_path = base_path
    # Pages already visited: Group id ==> True or False
    self.visited_href = set()
    # Topic ids that failed to be crawled
    self.failed = set()
    # Extract topic comments for each group in turn
    self.group_id = group_id
    self.topic_id_list = topic_id_list  # Topics waiting to be crawled
    # Results storage: topic ID ==> Topic object
    self.topic_dict = dict()
    # Next comment page to process: topic ID ==> 1, 2, 3...
    self.next_page = dict()
    # Topic ids whose crawl has finished
    self.finished = set()
    self.is_crawling = False
    # Maximum number of comments to crawl per topic
    # self.MAX_COMMETS_NUM = 5000
    self.MAX_COMMETS_NUM = float('inf')
    # Number of comments per page
    self.COMMENTS_PER_PAGE = 100
def __init__(self, args):
    # Maximum page depth
    self.depth = args.depth
    # Current crawl depth, starting from 1
    self.currentDepth = 1
    # Database
    self.database = Database(args.dbFile)
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(args.threadNum)
    # Links already visited
    self.visitedHrefs = set()
    # Pages waiting to be visited
    self.unvisitedHrefs = deque()
    # The first page to visit
    self.url = args.url
    self.unvisitedHrefs.append(args.url)
    # Flag marking whether the crawler is running
    self.isCrawling = False
def testThreadPool(self):
    allTheThreads = []
    with ThreadPool(10) as tp:
        for i in range(200):
            w = MockWorker(None, None, None, None, f"Thread {i}")
            allTheThreads.append(w)
            tp.addWorker(w)
    for thread in allTheThreads:
        self.assertFalse(thread.is_alive())
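# The test above relies on a ThreadPool that supports the context-manager
# protocol plus an addWorker() method, with __exit__ blocking until every
# worker has finished. The real implementation is not shown here; this is
# a minimal sketch of such a pool under those assumptions:
import threading

class ThreadPool:
    def __init__(self, max_workers):
        self._slots = threading.Semaphore(max_workers)  # bounds concurrency
        self._workers = []

    def addWorker(self, worker):
        self._slots.acquire()              # wait for a free slot
        self._workers.append(worker)
        run = worker.run
        def _run():
            try:
                run()
            finally:
                self._slots.release()      # free the slot when done
        worker.run = _run                  # wrap before the thread starts
        worker.start()

    def __enter__(self):
        return self

    def __exit__(self, *exc):
        for w in self._workers:
            w.join()                       # __exit__ waits for all workers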
def __init__(self, url, threadnum, limit):
    # self.database = Database('pichref.sql')
    self.file = PicFile('imgfile', 'a')
    self.threadPool = ThreadPool(threadnum)
    self.unaccesshref = deque()    # Double-ended queue of unvisited links
    self.accessedhref = set()      # Set of links already visited
    self.unaccesshref.append(url)  # Seed with the initial link
    self.limit = limit
    self.picUrlCount = 1
def __init__(self, threadnum, pathname, limit):
    '''`limit` caps the number of pictures; `pathname` is the save directory'''
    super(Crawler, self).__init__()
    self.threadPool = ThreadPool(threadnum)
    self.file = PicFile('imgfile', 'r')
    self.urlqueue = deque()
    self.count = 1
    self._makePath(pathname)
    self.savaPath = os.getcwd() + '/' + pathname
    self._getUrl(limit)
def __init__(
    self,
    tpSize=config.getfloat("irs", "thread_pool_size"),
    waitTimeout=3,
    maxTasks=config.getfloat("irs", "max_tasks"),
):
    self.storage_repository = config.get("irs", "repository")
    self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
    self._tasks = {}
    self._unqueuedTasks = []
def __init__(self, args):
    # Maximum page depth
    self.depth = args.depth
    # Initial crawl depth, starting from 1
    self.currentDepth = 1
    # Keyword, decoded with the console's default encoding
    self.keyword = args.keyword.decode(getdefaultlocale()[1])
    # Database
    self.database = Database()
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(args.threadNum)
    # Links already visited
    self.visitedHrefs = set()
    # Links waiting to be visited
    self.unvisitedHrefs = deque()
    # Add the first link to visit
    self.unvisitedHrefs.append(args.url)
    # Flag marking whether the crawler is running
    self.isCrawling = False
class FileTransferService(TCPService):
    def __init__(self, ip, port, directory):
        super(FileTransferService, self).__init__(name='FileTransfer',
                                                  ip=ip, port=port)
        self.directory = directory
        self.threadpool = ThreadPool(2)

    def start_service(self):
        self.threadpool.add_task(self.start_server, ('', 0))
        self.threadpool.wait_completion()

    def process_server_response(self, message, address):
        # The requested file path arrives in the message payload.
        needed_file = open(self.directory + '/' + message)

    def get_server_port(self):
        return self.server_socket.getsockname()[1]

    def get_file_from_node(self, node_ip, node_port, file_path):
        pass
def __init__(self, group_id, thread_num, group_info_path, topic_list_path,
             max_topics_num=1000):
    """
    `group_id`        the group id to crawl
    `thread_num`      number of crawler threads
    `group_info_path` file path for storing the group's own info
    `topic_list_path` file path for saving the complete topic id list
    """
    # Thread pool with the given number of threads
    self.thread_pool = ThreadPool(thread_num)
    # Thread for saving topics
    self.save_thread = ThreadPool(1)
    # Thread for writing the database
    # self.DBThread = ThreadPool(1)
    # Group-related information
    self.group_info_path = group_info_path
    self.topic_list_path = topic_list_path
    # Pages already visited: Group id ==> True or False
    self.visited_href = set()
    # Group discussion pages waiting to be visited
    self.unvisited_href = deque()
    # Links that failed to load
    self.failed_href = set()
    self.lock = Lock()  # Thread lock
    self.group_id = group_id
    self.group_info = None  # models.Group
    # Crawling ends in one of two cases: 1) the number of topics crawled
    # has reached the maximum; 2) all topics have been crawled
    # Only topic ids are saved
    self.topic_list = list()
    self.is_crawling = False
    # self.database = Database("DoubanGroup.db")
    # Maximum number of topics to crawl per group
    self.MAX_TOPICS_NUM = max_topics_num
def __init__(self, dbName, threadNum, logLevel, startUrls, depth, keyword,
             downloadMode):
    self.__threadNum = threadNum
    self.__startUrls = startUrls
    self.__depth = depth
    self.__keyword = keyword
    self.__downloadMode = downloadMode
    self.__dbName = dbName
    self.__logLevel = logLevel
    self.__exitEvent = threading.Event()
    # URL queue holding nodes waiting to be downloaded
    self.__urlQueue = Queue.Queue()
    # HTML queue holding downloaded nodes waiting to be parsed
    self.__htmlQueue = Queue.Queue()
    # Data queue holding parsed nodes that qualify for database storage
    self.__dataQueue = Queue.Queue()
    # Download queues allocated to the individual download modules
    self.__downloadQueueList = []
    # Create the thread pool
    self.__threadPool = ThreadPool(threadNum + 2)
    self.__downloadingFlag = 0
def saveProxies():
    threadPool = ThreadPool(30)
    threadPool.startThreads()
    proxyFileOK = open('proxyOK.txt', 'a')
    proxyFileFail = open('proxyFail.txt', 'a')
    for proxy in proxiex:
        threadPool.putTask(checkProxy, proxy)
    while threadPool.getTaskLeft():
        flag, proxy = threadPool.getTaskResult()
        print flag, proxy
        if flag == 'ok':
            proxyFileOK.write(proxy)
            proxyFileOK.write('\n')
        else:
            proxyFileFail.write(proxy)
            proxyFileFail.write('\n')
    threadPool.stopThreads()
    proxyFileOK.close()
    proxyFileFail.close()
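# checkProxy() and the `proxiex` list are defined elsewhere. A
# hypothetical Python 2 sketch that matches the (flag, proxy) result
# shape consumed above; the test URL and timeout are illustrative:
import urllib2

def checkProxy(proxy):
    # Route a request through the candidate proxy and report the outcome.
    opener = urllib2.build_opener(
        urllib2.ProxyHandler({'http': 'http://' + proxy}))
    try:
        opener.open('http://www.baidu.com', timeout=5).read()
        return 'ok', proxy
    except Exception:
        return 'fail', proxy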
def __init__(self, section_id, post_id_list, crawler_thread_num,
             save_thread_num, post_base_path):
    """
    `section_id`     the Tianya board name
    `post_id_list`   list of post ids to crawl
    `thread_num`     number of threads to start
    `post_base_path` base directory for the crawl results; each post gets
                     its own file, named after the post's ID
    """
    # Thread pool for fetching pages, with the given number of threads
    self.thread_pool = ThreadPool(crawler_thread_num)
    # Each topic is saved to its own file, so saving can run concurrently
    self.save_thread = ThreadPool(save_thread_num)
    # Base path for storing the crawl results
    self.base_path = post_base_path
    # Pages already visited: Group id ==> True or False
    self.visited_href = set()
    self.visited_post = set()  # Ids of pages already queued for visiting
    self.finished = set()      # Topic ids whose crawl has finished
    # Topic ids that failed to be crawled
    self.failed = set()
    # Extract topic comments for each board in turn
    self.section_id = section_id
    self.post_id_list = post_id_list  # Posts waiting to be crawled
    # Used to feed post ids into the task list step by step
    self.current_post_id_list = list(post_id_list)
    # Results storage: topic ID ==> Topic object
    self.post_dict = dict()
    # Next comment page to process: topic ID ==> 1, 2, 3...
    self.next_page = dict()
    self.is_crawling = False
    # Maximum number of comments to crawl per topic
    # self.MAX_COMMETS_NUM = 1000
    self.MAX_COMMETS_NUM = float('inf')
def main():
    node_list = ["0.0.0.0", "localhost"]
    main_pool = ThreadPool(3)
    discovery_service = DiscoveryService(ip="127.0.0.1",
                                         port=3000,
                                         initial_nodes=node_list,
                                         period=5)
    file_service = FileService(ip="127.0.0.1",
                               port=3001,
                               node_list=node_list,
                               directory='files/',
                               timeout=5)
    main_pool.add_task(discovery_service.start_service)
    main_pool.add_task(file_service.start_service)
    main_pool.wait_completion()
class Crawler(object):
    def __init__(self, threadnum, pathname, limit):
        '''`limit` caps the number of pictures; `pathname` is the save directory'''
        super(Crawler, self).__init__()
        self.threadPool = ThreadPool(threadnum)
        self.file = PicFile('imgfile', 'r')
        self.urlqueue = deque()
        self.count = 1
        self._makePath(pathname)
        self.savaPath = os.getcwd() + '/' + pathname
        self._getUrl(limit)

    def _makePath(self, pathname):
        '''Create the target directory under the current directory'''
        if not os.path.isdir(os.getcwd() + '/' + pathname):
            os.mkdir(os.getcwd() + '/' + pathname)

    def _getUrl(self, num):
        '''Load URLs from the file into the deque'''
        while len(self.urlqueue) < num:
            self.urlqueue.append(self.file.getData().rstrip('\n'))
        self.file.close()

    def start(self):
        print '---start downloading picture---'
        self.threadPool.startThreads()
        while self.urlqueue != deque([]):
            self.threadPool.putTask(self._handleTask, self.urlqueue.popleft())
        self.stop()

    def stop(self):
        self.threadPool.stopThreads()
        print '---end downloading picture---'

    def _handleTask(self, url):
        '''Task handler'''
        self._download(url)

    def _download(self, url, retry=2):
        '''Download a picture, naming files in ascending numeric order'''
        try:
            r = requests.get(url)
            with open(self.savaPath + '/' + str(self.count) + '.jpg', 'wb') as jpg:
                jpg.write(r.content)
            self.count += 1
            print url
        except Exception, e:
            # Retry a failed download, decrementing the budget each time
            if retry > 0:
                self._download(url, retry - 1)
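# A minimal usage sketch for the downloader above (thread count, target
# directory and image limit are illustrative values):
if __name__ == '__main__':
    crawler = Crawler(10, 'pics', 100)
    crawler.start()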
def start(self):
    with ThreadPool(self.max_jobs) as tp:
        for url_to_visit in self.urls_provider:
            if not self.exclusions.isExcluded(url_to_visit):
                logging.info(f"visiting url {url_to_visit.value}...")
                try:
                    self._waitUntilWorkingHour()
                    w = Worker(self.user_agent,
                               self.sentenceProcessor,
                               self.urlProcessor,
                               self.webSiteInfoProvider,
                               self.MINIMUM_WORDS_PER_SENTENCE,
                               url_to_visit.value)
                    tp.addWorker(w)
                except Exception as ex:
                    logging.error(f"Error fetching url {url_to_visit.value}")
                    logging.error(ex)
def __init__(self, args):
    # Maximum page depth
    self.depth = args.depth
    # Initial crawl depth, starting from 1
    self.currentDepth = 1
    # Keyword, decoded with the console's default encoding
    self.keyword = args.keyword.decode(getdefaultlocale()[1])
    # Database
    self.database = Database(args.dbFile)
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(args.threadNum)
    # Links already visited
    self.visitedHrefs = set()
    # Links waiting to be visited
    self.unvisitedHrefs = deque()
    # Add the first link to visit
    self.unvisitedHrefs.append(args.url)
    # Flag marking whether the crawler is running
    self.isCrawling = False
def main():
    threadPool = ThreadPool(5)
    threadPool.startThreads()
    # Read in unicode characters
    f = codecs.open('tables/TopicInfo-all.txt', 'r', 'utf-8')
    count = 0
    for line in f:
        line = line.strip()
        seg_list = line.split('[=]')
        if seg_list[1] == 'ustv':
            threadPool.putTask(task_handler, seg_list[0], seg_list)
            count += 1
    f.close()
    while threadPool.getTaskLeft() > 0:
        time.sleep(10)
        print 'Waiting to finish. Task left: %d' % threadPool.getTaskLeft()
    log.info('Number of topics in ustv: %d' % count)
def __init__(self, inbox, outbox, hostID, queue, monitorInterval):
    # Save arguments
    tpSize = config.getint('irs', 'thread_pool_size') / 2
    waitTimeout = 3
    maxTasks = config.getint('irs', 'max_tasks')
    self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
    self._stop = False
    self._flush = False
    self._queue = queue
    self._activeMessages = {}
    self._monitorInterval = monitorInterval
    self._hostID = int(hostID)
    self._used_slots_array = [0] * MESSAGES_PER_MAILBOX
    self._outgoingMail = EMPTYMAILBOX
    self._incomingMail = EMPTYMAILBOX
    # TODO: add support for multiple paths (multiple mailboxes)
    self._spmStorageDir = config.get('irs', 'repository')
    self._inCmd = [constants.EXT_DD,
                   'if=' + str(inbox),
                   'iflag=direct,fullblock',
                   'bs=' + str(BLOCK_SIZE),
                   'count=' + str(BLOCKS_PER_MAILBOX),
                   'skip=' + str(self._hostID * BLOCKS_PER_MAILBOX)
                   ]
    self._outCmd = [constants.EXT_DD,
                    'of=' + str(outbox),
                    'iflag=fullblock',
                    'oflag=direct',
                    'conv=notrunc',
                    'bs=' + str(BLOCK_SIZE),
                    'seek=' + str(self._hostID * BLOCKS_PER_MAILBOX)
                    ]
    self._init = False
    self._initMailbox()  # Read initial mailbox state
    self._msgCounter = 0
    self._sendMail()  # Clear outgoing mailbox
    threading.Thread.__init__(self)
    self.daemon = True
    self.name = "mailbox.HSMMonitor"
    self.start()
def __init__(self, args, startURLs):
    # Maximum page depth
    self.depth = args.depth
    # Initial crawl depth, starting from 1
    self.currentDepth = 1
    # Keyword, decoded with the console's default encoding
    # self.keyword = args.keyword.decode(getdefaultlocale()[1])
    # Database
    self.database = Database(args.dbFile)
    # Store group ids to file, using UTF-8
    self.groupfile = codecs.open("GroupID.txt", "w", "UTF-8")
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(args.threadNum)
    # Group ids already visited
    self.visitedGroups = set()
    # Group ids waiting to be visited
    self.unvisitedGroups = deque()
    # Information for all groups
    self.groupInfo = []
    self.lock = Lock()  # Thread lock
    # Flag marking whether the crawler is running
    self.isCrawling = False
    # Add the group home pages not yet visited
    for url in startURLs:
        match_obj = REGroup.match(url)
        print "Add start urls:", url
        assert(match_obj != None)
        self.unvisitedGroups.append(match_obj.group(1))
    # Maximum number of visits allowed per minute
    self.MAX_VISITS_PER_MINUTE = 10
    # Number of pages already visited in the current period
    self.currentPeriodVisits = 0
    # Treat one minute as one visit period; record the period's start,
    # initialized with the current time
    self.periodStart = time.time()
def __init__(self, args):
    # Maximum page depth
    self.depth = args.depth
    # Initial crawl depth, starting from 1
    self.currentDepth = 1
    # Keyword, decoded with the console's default encoding
    self.keyword = args.keyword.decode(getdefaultlocale()[1])
    # Database
    self.database = Database(args.dbFile)
    # Thread pool with the given number of threads
    self.threadPool = ThreadPool(args.threadNum)
    # Links already visited
    self.visitedHrefs = set()
    # Links waiting to be visited
    self.unvisitedHrefs = deque()
    # Seeding a single start link is disabled; domain seeds are used instead
    # self.unvisitedHrefs.append(args.url)
    # Flag marking whether the crawler is running
    self.isCrawling = False
    self.domainPattern = re.compile(
        r"^([0-9a-zA-Z][0-9a-zA-Z-]{0,62}\.)+([0-9a-zA-Z][0-9a-zA-Z-]{0,62})\.?$")
    self.maxDomainSeeds = args.maxDomainSeeds
    self._initDomainSeedsList(args.domainSeeds)
def subscriber_join_verify(self, num_subscribers=10, num_channels=1,
                           channel_start=0, cbs=None, port_list=[]):
    self.test_status = False
    self.num_subscribers = num_subscribers
    self.subscriber_load(create=True, num=num_subscribers,
                         num_channels=num_channels,
                         channel_start=channel_start, port_list=port_list)
    self.onos_aaa_load()
    self.thread_pool = ThreadPool(min(100, self.num_subscribers),
                                  queue_size=1, wait_timeout=1)
    chan_leave = False  # for single channel, multiple subscribers
    if cbs is None:
        cbs = (self.tls_verify, self.dhcp_verify, self.igmp_verify,
               self.traffic_verify)
        chan_leave = True
    for subscriber in self.subscriber_list:
        subscriber.start()
        pool_object = subscriber_pool(subscriber, cbs)
        self.thread_pool.addTask(pool_object.pool_cb)
    self.thread_pool.cleanUpThreads()
    for subscriber in self.subscriber_list:
        subscriber.stop()
        if chan_leave is True:
            subscriber.channel_leave(0)
    self.num_subscribers = 0
    return self.test_status
class DiscoveryService(UDPService):
    def __init__(self, ip, port, initial_nodes, period):
        super(DiscoveryService, self).__init__(name='Discovery', ip=ip,
                                               port=port)
        self.period = period
        self.nodes = initial_nodes
        self.threadpool = ThreadPool(2)

    def start_service(self):
        self.threadpool.add_task(self.discovery_job)
        self.threadpool.add_task(self.start_server)
        self.threadpool.wait_completion()

    def discovery_job(self):
        while True:
            self.send_nodes_to_others()
            sleep(self.period)

    def update_nodes(self, discovered_list):
        for discovered_node in discovered_list:
            # Compare by value; `is not` would compare object identity
            if (discovered_node not in self.nodes
                    and discovered_node != self.ip):
                self.nodes.append(discovered_node)

    def process_server_response(self, message, address):
        self.update_nodes(ast.literal_eval(message))
        print("Nodes list updated by Discovery service's server: " +
              str(self.nodes))

    def send_nodes_to_others(self):
        message = str(self.nodes)
        for node in self.nodes:
            self.send_message(str_message=message,
                              address=(node, self.port),
                              socket=self.client_socket)
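# A minimal sketch of running one discovery peer locally (the port,
# period and seed list are illustrative; the UDPService base class and
# its socket wiring are assumed to come from the surrounding project):
if __name__ == '__main__':
    service = DiscoveryService(ip='127.0.0.1', port=3000,
                               initial_nodes=['127.0.0.1'], period=5)
    service.start_service()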
class subscriber_exchange(unittest.TestCase):

    apps = ('org.opencord.aaa', 'org.onosproject.dhcp')
    olt_apps = ()  # 'org.opencord.cordmcast'
    table_app = 'org.ciena.cordigmp'
    dhcp_server_config = {
        "ip": "10.1.11.50",
        "mac": "ca:fe:ca:fe:ca:fe",
        "subnet": "255.255.252.0",
        "broadcast": "10.1.11.255",
        "router": "10.1.8.1",
        "domain": "8.8.8.8",
        "ttl": "63",
        "delay": "2",
        "startip": "10.1.11.51",
        "endip": "10.1.11.100",
    }
    aaa_loaded = False
    test_path = os.path.dirname(os.path.realpath(__file__))
    table_app_file = os.path.join(test_path, '..', 'apps/ciena-cordigmp-multitable-2.0-SNAPSHOT.oar')
    app_file = os.path.join(test_path, '..', 'apps/ciena-cordigmp-2.0-SNAPSHOT.oar')
    onos_config_path = os.path.join(test_path, '..', 'setup/onos-config')
    olt_conf_file = os.path.join(test_path, '..', 'setup/olt_config.json')
    cpqd_path = os.path.join(test_path, '..', 'setup')
    ovs_path = cpqd_path
    test_services = ('IGMP', 'TRAFFIC')
    num_joins = 0
    num_subscribers = 0
    num_channels = 0
    recv_timeout = False
    onos_restartable = not bool(int(os.getenv('ONOS_RESTART_DISABLED', 0)))

    @classmethod
    def load_device_id(cls):
        '''Configure the device id'''
        did = OnosCtrl.get_device_id()
        # Set the default config
        cls.device_id = did
        cls.device_dict = {
            "devices": {
                "{}".format(did): {
                    "basic": {
                        "driver": "pmc-olt"
                    }
                }
            },
        }
        return did

    @classmethod
    def setUpClass(cls):
        '''Load the OLT config and activate relevant apps'''
        did = cls.load_device_id()
        network_cfg = {
            "devices": {
                "{}".format(did): {
                    "basic": {
                        "driver": "pmc-olt"
                    }
                }
            },
        }
        # Restart ONOS with cpqd driver config for OVS
        cls.start_onos(network_cfg=network_cfg)
        cls.install_app_table()
        cls.olt = OltConfig(olt_conf_file=cls.olt_conf_file)
        OnosCtrl.cord_olt_config(cls.olt.olt_device_data())
        cls.port_map, cls.port_list = cls.olt.olt_port_map()
        cls.activate_apps(cls.apps + cls.olt_apps)

    @classmethod
    def tearDownClass(cls):
        '''Deactivate the olt apps and restart OVS back'''
        apps = cls.olt_apps + (cls.table_app,)
        for app in apps:
            onos_ctrl = OnosCtrl(app)
            onos_ctrl.deactivate()
        cls.uninstall_app_table()
        cls.start_onos(network_cfg={})

    @classmethod
    def activate_apps(cls, apps):
        for app in apps:
            onos_ctrl = OnosCtrl(app)
            status, _ = onos_ctrl.activate()
            assert_equal(status, True)
            time.sleep(2)

    @classmethod
    def install_app_table(cls):
        # Uninstall the existing app if any
        OnosCtrl.uninstall_app(cls.table_app)
        time.sleep(2)
        log.info('Installing the multi table app %s for subscriber test' % (cls.table_app_file))
        OnosCtrl.install_app(cls.table_app_file)
        time.sleep(3)

    @classmethod
    def uninstall_app_table(cls):
        # Uninstall the table app on class exit
        OnosCtrl.uninstall_app(cls.table_app)
        time.sleep(2)
        log.info('Installing back the cord igmp app %s for subscriber test on exit' % (cls.app_file))
        OnosCtrl.install_app(cls.app_file)

    @classmethod
    def start_onos(cls, network_cfg=None):
        if cls.onos_restartable is False:
            log.info('ONOS restart is disabled. Skipping ONOS restart')
            return
        if network_cfg is None:
            network_cfg = cls.device_dict
        if type(network_cfg) is tuple:
            res = []
            for v in network_cfg:
                res += v.items()
            config = dict(res)
        else:
            config = network_cfg
        log.info('Restarting ONOS with new network configuration')
        return cord_test_onos_restart(config=config)

    @classmethod
    def remove_onos_config(cls):
        try:
            os.unlink('{}/network-cfg.json'.format(cls.onos_config_path))
        except:
            pass

    @classmethod
    def start_cpqd(cls, mac='00:11:22:33:44:55'):
        dpid = mac.replace(':', '')
        cpqd_file = os.sep.join((cls.cpqd_path, 'cpqd.sh'))
        cpqd_cmd = '{} {}'.format(cpqd_file, dpid)
        ret = os.system(cpqd_cmd)
        assert_equal(ret, 0)
        time.sleep(10)
        device_id = 'of:{}{}'.format('0' * 4, dpid)
        return device_id

    @classmethod
    def start_ovs(cls):
        ovs_file = os.sep.join((cls.ovs_path, 'of-bridge.sh'))
        ret = os.system(ovs_file)
        assert_equal(ret, 0)
        time.sleep(30)

    def onos_aaa_load(self):
        if self.aaa_loaded:
            return
        aaa_dict = {'apps': {'org.onosproject.aaa':
                             {'AAA': {'radiusSecret': 'radius_password',
                                      'radiusIp': '172.17.0.2'}}}}
        radius_ip = os.getenv('ONOS_AAA_IP') or '172.17.0.2'
        aaa_dict['apps']['org.onosproject.aaa']['AAA']['radiusIp'] = radius_ip
        self.onos_load_config('org.onosproject.aaa', aaa_dict)
        self.aaa_loaded = True

    def onos_dhcp_table_load(self, config=None):
        dhcp_dict = {'apps': {'org.onosproject.dhcp':
                              {'dhcp': copy.copy(self.dhcp_server_config)}}}
        dhcp_config = dhcp_dict['apps']['org.onosproject.dhcp']['dhcp']
        if config:
            for k in config.keys():
                if dhcp_config.has_key(k):
                    dhcp_config[k] = config[k]
        self.onos_load_config('org.onosproject.dhcp', dhcp_dict)

    def onos_load_config(self, app, config):
        status, code = OnosCtrl.config(config)
        if status is False:
            log.info('JSON config request for app %s returned status %d' % (app, code))
            assert_equal(status, True)
        time.sleep(2)

    def dhcp_sndrcv(self, dhcp, update_seed=False):
        cip, sip = dhcp.discover(update_seed=update_seed)
        assert_not_equal(cip, None)
        assert_not_equal(sip, None)
        log.info('Got dhcp client IP %s from server %s for mac %s' %
                 (cip, sip, dhcp.get_mac(cip)[0]))
        return cip, sip

    def dhcp_request(self, subscriber, seed_ip='10.10.10.1', update_seed=False):
        config = {'startip': '10.10.10.20', 'endip': '10.10.10.200',
                  'ip': '10.10.10.2', 'mac': "ca:fe:ca:fe:ca:fe",
                  'subnet': '255.255.255.0', 'broadcast': '10.10.10.255',
                  'router': '10.10.10.1'}
        self.onos_dhcp_table_load(config)
        dhcp = DHCPTest(seed_ip=seed_ip, iface=subscriber.iface)
        cip, sip = self.dhcp_sndrcv(dhcp, update_seed=update_seed)
        return cip, sip

    def recv_channel_cb(self, pkt):
        # First verify that we have received the packet for the joined instance
        chan = self.subscriber.caddr(pkt[IP].dst)
        assert_equal(chan in self.subscriber.join_map.keys(), True)
        recv_time = monotonic.monotonic() * 1000000
        join_time = self.subscriber.join_map[chan][self.subscriber.STATS_JOIN].start
        delta = recv_time - join_time
        self.subscriber.join_rx_stats.update(packets=1, t=delta, usecs=True)
        self.subscriber.channel_update(chan, self.subscriber.STATS_RX, 1, t=delta)
        log.debug('Packet received in %.3f usecs for group %s after join' %
                  (delta, pkt[IP].dst))
        self.test_status = True

    def traffic_verify(self, subscriber):
        if subscriber.has_service('TRAFFIC'):
            url = 'http://www.google.com'
            resp = requests.get(url)
            self.test_status = resp.ok
            if resp.ok == False:
                log.info('Subscriber %s failed get from url %s with status code %d' %
                         (subscriber.name, url, resp.status_code))
            else:
                log.info('GET request from %s succeeded for subscriber %s' %
                         (url, subscriber.name))

    def tls_verify(self, subscriber):
        if subscriber.has_service('TLS'):
            time.sleep(2)
            tls = TLSAuthTest(intf=subscriber.rx_intf)
            log.info('Running subscriber %s tls auth test' % subscriber.name)
            tls.runTest()
            self.test_status = True

    def dhcp_verify(self, subscriber):
        if subscriber.has_service('DHCP'):
            cip, sip = self.dhcp_request(subscriber, update_seed=True)
            log.info('Subscriber %s got client ip %s from server %s' %
                     (subscriber.name, cip, sip))
            subscriber.src_list = [cip]
            self.test_status = True
        else:
            subscriber.src_list = ['10.10.10.{}'.format(subscriber.rx_port)]
            self.test_status = True

    def dhcp_jump_verify(self, subscriber):
        if subscriber.has_service('DHCP'):
            cip, sip = self.dhcp_request(subscriber, seed_ip='10.10.200.1')
            log.info('Subscriber %s got client ip %s from server %s' %
                     (subscriber.name, cip, sip))
            subscriber.src_list = [cip]
            self.test_status = True
        else:
            subscriber.src_list = ['10.10.10.{}'.format(subscriber.rx_port)]
            self.test_status = True

    def dhcp_next_verify(self, subscriber):
        if subscriber.has_service('DHCP'):
            cip, sip = self.dhcp_request(subscriber, seed_ip='10.10.150.1')
            log.info('Subscriber %s got client ip %s from server %s' %
                     (subscriber.name, cip, sip))
            subscriber.src_list = [cip]
            self.test_status = True
        else:
            subscriber.src_list = ['10.10.10.{}'.format(subscriber.rx_port)]
            self.test_status = True

    def igmp_verify(self, subscriber):
        chan = 0
        if subscriber.has_service('IGMP'):
            # We wait for all the subscribers to join before triggering leaves
            if subscriber.rx_port > 1:
                time.sleep(5)
            subscriber.channel_join(chan, delay=0)
            self.num_joins += 1
            while self.num_joins < self.num_subscribers:
                time.sleep(5)
            log.info('All subscribers have joined the channel')
            for i in range(10):
                subscriber.channel_receive(chan, cb=subscriber.recv_channel_cb, count=10)
                log.info('Leaving channel %d for subscriber %s' % (chan, subscriber.name))
                subscriber.channel_leave(chan)
                time.sleep(5)
                log.info('Interface %s Join RX stats for subscriber %s, %s' %
                         (subscriber.iface, subscriber.name, subscriber.join_rx_stats))
                # Should not receive packets for this subscriber
                self.recv_timeout = True
                subscriber.recv_timeout = True
                subscriber.channel_receive(chan, cb=subscriber.recv_channel_cb, count=10)
                subscriber.recv_timeout = False
                self.recv_timeout = False
                log.info('Joining channel %d for subscriber %s' % (chan, subscriber.name))
                subscriber.channel_join(chan, delay=0)
            self.test_status = True

    def igmp_jump_verify(self, subscriber):
        if subscriber.has_service('IGMP'):
            for i in xrange(subscriber.num):
                log.info('Subscriber %s jumping channel' % subscriber.name)
                chan = subscriber.channel_jump(delay=0)
                subscriber.channel_receive(chan, cb=subscriber.recv_channel_cb, count=1)
                log.info('Verified receive for channel %d, subscriber %s' % (chan, subscriber.name))
                time.sleep(3)
            log.info('Interface %s Jump RX stats for subscriber %s, %s' %
                     (subscriber.iface, subscriber.name, subscriber.join_rx_stats))
            self.test_status = True

    def igmp_next_verify(self, subscriber):
        if subscriber.has_service('IGMP'):
            for i in xrange(subscriber.num):
                if i:
                    chan = subscriber.channel_join_next(delay=0)
                else:
                    chan = subscriber.channel_join(i, delay=0)
                log.info('Joined next channel %d for subscriber %s' % (chan, subscriber.name))
                subscriber.channel_receive(chan, cb=subscriber.recv_channel_cb, count=1)
                log.info('Verified receive for channel %d, subscriber %s' % (chan, subscriber.name))
                time.sleep(3)
            log.info('Interface %s Join Next RX stats for subscriber %s, %s' %
                     (subscriber.iface, subscriber.name, subscriber.join_rx_stats))
            self.test_status = True

    def generate_port_list(self, subscribers, channels):
        return self.port_list[:subscribers]

    def subscriber_load(self, create=True, num=10, num_channels=1,
                        channel_start=0, port_list=[]):
        '''Load the subscriber from the database'''
        self.subscriber_db = SubscriberDB(create=create, services=self.test_services)
        if create is True:
            self.subscriber_db.generate(num)
        self.subscriber_info = self.subscriber_db.read(num)
        self.subscriber_list = []
        if not port_list:
            port_list = self.generate_port_list(num, num_channels)
        index = 0
        for info in self.subscriber_info:
            self.subscriber_list.append(
                Subscriber(name=info['Name'],
                           service=info['Service'],
                           port_map=self.port_map,
                           num=num_channels,
                           channel_start=channel_start,
                           tx_port=port_list[index][0],
                           rx_port=port_list[index][1]))
            if num_channels > 1:
                channel_start += num_channels
            index += 1
        # Load the ssm list for all subscriber channels
        igmpChannel = IgmpChannel()
        ssm_groups = map(lambda sub: sub.channels, self.subscriber_list)
        ssm_list = reduce(lambda ssm1, ssm2: ssm1 + ssm2, ssm_groups)
        igmpChannel.igmp_load_ssm_config(ssm_list)

    def subscriber_join_verify(self, num_subscribers=10, num_channels=1,
                               channel_start=0, cbs=None, port_list=[]):
        self.test_status = False
        self.num_subscribers = num_subscribers
        self.subscriber_load(create=True, num=num_subscribers,
                             num_channels=num_channels,
                             channel_start=channel_start, port_list=port_list)
        self.onos_aaa_load()
        self.thread_pool = ThreadPool(min(100, self.num_subscribers),
                                      queue_size=1, wait_timeout=1)
        chan_leave = False  # for single channel, multiple subscribers
        if cbs is None:
            cbs = (self.tls_verify, self.dhcp_verify, self.igmp_verify,
                   self.traffic_verify)
            chan_leave = True
        for subscriber in self.subscriber_list:
            subscriber.start()
            pool_object = subscriber_pool(subscriber, cbs)
            self.thread_pool.addTask(pool_object.pool_cb)
        self.thread_pool.cleanUpThreads()
        for subscriber in self.subscriber_list:
            subscriber.stop()
            if chan_leave is True:
                subscriber.channel_leave(0)
        self.num_subscribers = 0
        return self.test_status

    def test_subscriber_join_recv(self):
        """Test subscriber join and receive for channel surfing"""
        self.num_subscribers = 5
        self.num_channels = 1
        test_status = True
        # Run this test only if ONOS can be restarted, as it incurs a
        # network-cfg change
        if self.onos_restartable is True:
            test_status = self.subscriber_join_verify(
                num_subscribers=self.num_subscribers,
                num_channels=self.num_channels,
                port_list=self.generate_port_list(self.num_subscribers,
                                                  self.num_channels))
        assert_equal(test_status, True)

    def test_subscriber_join_jump(self):
        """Test subscriber join jump for channel surfing"""
        self.num_subscribers = 5
        self.num_channels = 10
        test_status = self.subscriber_join_verify(
            num_subscribers=self.num_subscribers,
            num_channels=self.num_channels,
            cbs=(self.tls_verify, self.dhcp_jump_verify,
                 self.igmp_jump_verify, self.traffic_verify),
            port_list=self.generate_port_list(self.num_subscribers,
                                              self.num_channels))
        assert_equal(test_status, True)

    def test_subscriber_join_next(self):
        """Test subscriber join next for channel surfing"""
        self.num_subscribers = 5
        self.num_channels = 10
        test_status = self.subscriber_join_verify(
            num_subscribers=self.num_subscribers,
            num_channels=self.num_channels,
            cbs=(self.tls_verify, self.dhcp_next_verify,
                 self.igmp_next_verify, self.traffic_verify),
            port_list=self.generate_port_list(self.num_subscribers,
                                              self.num_channels))
        assert_equal(test_status, True)
class SPM_MailMonitor:
    log = logging.getLogger('Storage.MailBox.SpmMailMonitor')

    def registerMessageType(self, messageType, callback):
        self._messageTypes[messageType] = callback

    def unregisterMessageType(self, messageType):
        del self._messageTypes[messageType]

    def __init__(self, pool, maxHostID, monitorInterval=2):
        self._messageTypes = {}
        # Save arguments
        self._stop = False
        self._stopped = False
        self._poolID = str(pool.spUUID)
        self._spmStorageDir = pool.storage_repository
        tpSize = config.getfloat('irs', 'thread_pool_size') / 2
        waitTimeout = 3
        maxTasks = config.getfloat('irs', 'max_tasks')
        self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
        # *** IMPORTANT NOTE: The SPM's inbox is the HSMs' outbox and vice versa ***
        self._inbox = os.path.join(self._spmStorageDir, self._poolID,
                                   "mastersd", sd.DOMAIN_META_DATA, "inbox")
        if not os.path.exists(self._inbox):
            self.log.error("SPM_MailMonitor create failed - inbox %s does "
                           "not exist" % repr(self._inbox))
            raise RuntimeError("SPM_MailMonitor create failed - inbox %s "
                               "does not exist" % repr(self._inbox))
        self._outbox = os.path.join(self._spmStorageDir, self._poolID,
                                    "mastersd", sd.DOMAIN_META_DATA, "outbox")
        if not os.path.exists(self._outbox):
            self.log.error("SPM_MailMonitor create failed - outbox %s does "
                           "not exist" % repr(self._outbox))
            raise RuntimeError("SPM_MailMonitor create failed - outbox %s "
                               "does not exist" % repr(self._outbox))
        self._numHosts = int(maxHostID)
        self._outMailLen = MAILBOX_SIZE * self._numHosts
        self._monitorInterval = monitorInterval
        # TODO: add support for multiple paths (multiple mailboxes)
        self._outgoingMail = self._outMailLen * "\0"
        self._incomingMail = self._outgoingMail
        self._inCmd = ['dd',
                       'if=' + str(self._inbox),
                       'iflag=direct,fullblock',
                       'count=1'
                       ]
        self._outCmd = ['dd',
                        'of=' + str(self._outbox),
                        'oflag=direct',
                        'iflag=fullblock',
                        'conv=notrunc',
                        'count=1'
                        ]
        self._outLock = thread.allocate_lock()
        self._inLock = thread.allocate_lock()
        # Clear outgoing mail
        self.log.debug("SPM_MailMonitor - clearing outgoing mail, command "
                       "is: %s", self._outCmd)
        cmd = self._outCmd + ['bs=' + str(self._outMailLen)]
        (rc, out, err) = misc.execCmd(cmd, sudo=False, data=self._outgoingMail)
        if rc:
            self.log.warning("SPM_MailMonitor couldn't clear outgoing mail, "
                             "dd failed")
        thread.start_new_thread(self.run, (self, ))
        self.log.debug('SPM_MailMonitor created for pool %s' % self._poolID)

    def stop(self):
        self._stop = True

    def isStopped(self):
        return self._stopped

    def getMaxHostID(self):
        return self._numHosts

    def setMaxHostID(self, newMaxId):
        self._inLock.acquire()
        self._outLock.acquire()
        diff = newMaxId - self._numHosts
        if diff > 0:
            delta = MAILBOX_SIZE * diff * "\0"
            self._outgoingMail += delta
            self._incomingMail += delta
        elif diff < 0:
            delta = MAILBOX_SIZE * diff
            self._outgoingMail = self._outgoingMail[:-delta]
            self._incomingMail = self._incomingMail[:-delta]
        self._numHosts = newMaxId
        self._outMailLen = MAILBOX_SIZE * self._numHosts
        self._outLock.release()
        self._inLock.release()

    def _validateMailbox(self, mailbox, mailboxIndex):
        chkStart = MAILBOX_SIZE - CHECKSUM_BYTES
        chk = misc.checksum(mailbox[0:chkStart], CHECKSUM_BYTES)
        pChk = struct.pack('<l', chk)  # Assumes CHECKSUM_BYTES equals 4!!!
        if pChk != mailbox[chkStart:chkStart + CHECKSUM_BYTES]:
            self.log.error("SPM_MailMonitor: mailbox %s checksum failed, "
                           "not clearing mailbox, clearing newMail.",
                           str(mailboxIndex))
            return False
        elif pChk == pZeroChecksum:
            return False  # Ignore messages of empty mailbox
        return True

    def _handleRequests(self, newMail):
        send = False
        # Run through all messages and check if new messages have arrived
        # (since last read)
        for host in range(0, self._numHosts):
            # Check mailbox checksum
            mailboxStart = host * MAILBOX_SIZE
            isMailboxValidated = False
            for i in range(0, MESSAGES_PER_MAILBOX):
                msgId = host * SLOTS_PER_MAILBOX + i
                msgStart = msgId * MESSAGE_SIZE
                # First byte of message is message version. Check message
                # version, if 0 then message is empty and can be skipped
                if newMail[msgStart] in ['\0', '0']:
                    continue
                # Most mailboxes are probably empty so it costs less to check
                # that all messages start with 0 than to validate the
                # mailbox, therefore this is done after we find a non empty
                # message in the mailbox
                if not isMailboxValidated:
                    if not self._validateMailbox(
                            newMail[mailboxStart:mailboxStart + MAILBOX_SIZE],
                            host):
                        # Cleaning invalid mbx in newMail
                        newMail = newMail[:mailboxStart] + EMPTYMAILBOX + \
                            newMail[mailboxStart + MAILBOX_SIZE:]
                        break
                    self.log.debug("SPM_MailMonitor: Mailbox %s validated, "
                                   "checking mail", host)
                    isMailboxValidated = True
                newMsg = newMail[msgStart:msgStart + MESSAGE_SIZE]
                msgOffset = msgId * MESSAGE_SIZE
                if newMsg == CLEAN_MESSAGE:
                    # Should probably put a setter on outgoingMail which
                    # would take the lock
                    self._outLock.acquire()
                    try:
                        self._outgoingMail = \
                            self._outgoingMail[0:msgOffset] + CLEAN_MESSAGE + \
                            self._outgoingMail[msgOffset + MESSAGE_SIZE:
                                               self._outMailLen]
                    finally:
                        self._outLock.release()
                    send = True
                    continue
                # Message isn't empty, check if it's new
                isMessageNew = False
                for j in range(msgStart, msgStart + MESSAGE_SIZE):
                    if newMail[j] != self._incomingMail[j]:
                        isMessageNew = True
                        break
                # If search exhausted, i.e. message hasn't changed since
                # last read, it can be skipped
                if not isMessageNew:
                    continue
                # We only get here if there is a novel request
                try:
                    msgType = newMail[msgStart + 1:msgStart + 5]
                    if msgType in self._messageTypes:
                        # Use message class to process request according to
                        # message specific logic
                        id = str(uuid.uuid4())
                        self.log.debug("SPM_MailMonitor: processing request: "
                                       "%s" %
                                       repr(newMail[msgStart:
                                                    msgStart + MESSAGE_SIZE]))
                        res = self.tp.queueTask(
                            id, runTask,
                            (self._messageTypes[msgType], msgId,
                             newMail[msgStart:msgStart + MESSAGE_SIZE])
                        )
                        if not res:
                            raise Exception()
                    else:
                        self.log.error("SPM_MailMonitor: unknown message "
                                       "type encountered: %s", msgType)
                except RuntimeError, e:
                    self.log.error("SPM_MailMonitor: exception: %s caught "
                                   "while handling message: %s", str(e),
                                   newMail[msgStart:msgStart + MESSAGE_SIZE])
                except:
                    self.log.error("SPM_MailMonitor: exception caught while "
                                   "handling message: %s",
                                   newMail[msgStart:msgStart + MESSAGE_SIZE],
                                   exc_info=True)
class HSM_MailMonitor(threading.Thread):
    log = logging.getLogger('Storage.MailBox.HsmMailMonitor')

    def __init__(self, inbox, outbox, hostID, queue, monitorInterval):
        # Save arguments
        tpSize = config.getfloat('irs', 'thread_pool_size') / 2
        waitTimeout = 3
        maxTasks = config.getfloat('irs', 'max_tasks')
        self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
        self._stop = False
        self._flush = False
        self._queue = queue
        self._activeMessages = {}
        self._monitorInterval = monitorInterval
        self._hostID = int(hostID)
        self._used_slots_array = [0] * MESSAGES_PER_MAILBOX
        self._outgoingMail = EMPTYMAILBOX
        self._incomingMail = EMPTYMAILBOX
        # TODO: add support for multiple paths (multiple mailboxes)
        self._spmStorageDir = config.get('irs', 'repository')
        self._inCmd = [constants.EXT_DD,
                       'if=' + str(inbox),
                       'iflag=direct,fullblock',
                       'bs=' + str(BLOCK_SIZE),
                       'count=' + str(BLOCKS_PER_MAILBOX),
                       'skip=' + str(self._hostID * BLOCKS_PER_MAILBOX)
                       ]
        self._outCmd = [constants.EXT_DD,
                        'of=' + str(outbox),
                        'iflag=fullblock',
                        'oflag=direct',
                        'conv=notrunc',
                        'bs=' + str(BLOCK_SIZE),
                        'seek=' + str(self._hostID * BLOCKS_PER_MAILBOX)
                        ]
        self._init = False
        self._initMailbox()  # Read initial mailbox state
        self._msgCounter = 0
        self._sendMail()  # Clear outgoing mailbox
        threading.Thread.__init__(self)
        self.start()

    def _initMailbox(self):
        # Sync initial incoming mail state with storage view
        (rc, out, err) = misc.execCmd(self._inCmd, sudo=False, raw=True)
        if rc == 0:
            self._incomingMail = out
            self._init = True
        else:
            self.log.warning("HSM_MailboxMonitor - Could not initialize "
                             "mailbox, will not accept requests until init "
                             "succeeds")

    def immStop(self):
        self._stop = True

    def immFlush(self):
        self._flush = True

    def _handleResponses(self, newMsgs):
        rc = False
        for i in range(0, MESSAGES_PER_MAILBOX):
            # Skip checking non used slots
            if self._used_slots_array[i] == 0:
                continue
            # Skip empty return messages (messages with version 0)
            start = i * MESSAGE_SIZE
            # First byte of message is message version.
            # Check return message version, if 0 then message is empty
            if newMsgs[start] in ['\0', '0']:
                continue
            for j in range(start, start + MESSAGE_SIZE):
                if newMsgs[j] != self._incomingMail[j]:
                    break
            # If search exhausted then message hasn't changed since last
            # read and can be skipped
            if j == (start + MESSAGE_SIZE - 1):
                continue
            # We only get here if there is a novel reply, so we can remove
            # the message from the active list and the outgoing mail and
            # handle the reply
            rc = True
            newMsg = newMsgs[start:start + MESSAGE_SIZE]
            if newMsg == CLEAN_MESSAGE:
                del self._activeMessages[i]
                self._used_slots_array[i] = 0
                self._msgCounter -= 1
                self._outgoingMail = self._outgoingMail[0:start] + \
                    MESSAGE_SIZE * "\0" + \
                    self._outgoingMail[start + MESSAGE_SIZE:]
                continue
            msg = self._activeMessages[i]
            self._activeMessages[i] = CLEAN_MESSAGE
            self._outgoingMail = self._outgoingMail[0:start] + \
                CLEAN_MESSAGE + self._outgoingMail[start + MESSAGE_SIZE:]
            try:
                self.log.debug("HSM_MailboxMonitor(%s/%s) - Checking reply: "
                               "%s", self._msgCounter, MESSAGES_PER_MAILBOX,
                               repr(newMsg))
                msg.checkReply(newMsg)
                if msg.callback:
                    try:
                        id = str(uuid.uuid4())
                        if not self.tp.queueTask(id, runTask,
                                                 (msg.callback,
                                                  msg.volumeData)):
                            raise Exception()
                    except:
                        self.log.error("HSM_MailMonitor: exception caught "
                                       "while running msg callback, for "
                                       "message: %s, callback function: %s",
                                       repr(msg.payload), msg.callback,
                                       exc_info=True)
            except RuntimeError, e:
                self.log.error("HSM_MailMonitor: exception: %s caught while "
                               "checking reply for message: %s, reply: %s",
                               str(e), repr(msg.payload), repr(newMsg))
            except:
class Crawler(object):
    def __init__(self, args=Strategy()):
        self.url = args.url
        self.max_depth = args.max_depth      # Maximum page depth
        self.max_count = args.max_count      # Maximum number of pages to crawl
        self.concurrency = args.concurrency  # Number of threads
        self.timeout = args.timeout          # Timeout
        self.cookies = args.cookies          # Cookies
        self.ssl_verify = args.ssl_verify    # SSL verification
        self.same_host = args.same_host      # Only crawl links on the same host
        self.same_domain = args.same_domain  # Only crawl links in the same domain
        self.currentDepth = 1                # Initial crawl depth, starting from 1
        self.keyword = args.keyword          # Keyword, decoded with the console's default encoding
        self.threadPool = ThreadPool(args.concurrency)  # Thread pool with the given number of threads
        self.visitedHrefs = set()            # Links already visited
        self.unvisitedHrefs = deque()        # Links waiting to be visited
        self.unvisitedHrefs.append(args.url) # Seed with the first link to visit
        self.isCrawling = False              # Flag marking whether the crawler is running
        self.file = BASEDIR + '/cache/crawler/' + genFilename(self.url) + '.txt'
        # FIXME: the following line is problematic
        self.database = Database(args.dbFile)  # Database
        self.lock = Lock()

    def start(self):
        self.isCrawling = True
        self.threadPool.startThreads()
        while self.currentDepth <= self.max_depth and \
                len(self.visitedHrefs) <= self.max_count:
            # Dispatch tasks: the pool downloads all pages at the current
            # depth concurrently (this call does not block)
            self._assignCurrentDepthTasks()
            # Wait until the pool has finished all tasks; a drained pool
            # means one full depth level has been crawled.
            # self.threadPool.taskJoin() would do the same, but then
            # Ctrl-C could not interrupt the wait.
            counter = 0
            while self.threadPool.getTaskLeft() and counter < 600:
                time.sleep(1)
                counter += 1
            self.currentDepth += 1
        self.stop()

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        # self.database.close()

    def saveAllHrefsToFile(self, nonehtml=True):
        try:
            cf = CrawlerFile(url=self.url)
            contentlist = []
            hrefs = [i for i in self.visitedHrefs] + \
                    [j for j in self.unvisitedHrefs]
            for href in hrefs:
                if href.endswith('.html') and nonehtml:
                    continue
                contentlist.append(href)
            cf.saveSection('Hrefs', contentlist, coverfile=True)
        except:
            pass

    def _getCrawlerPaths(self, url):
        '''Collect the path prefixes of every known href that shares the
        scheme and host of `url`.'''
        try:
            paths = []
            baseulp = urlparse(url)
            cf = CrawlerFile(url=url)
            urls = cf.getSection('Hrefs')
            for eachline in urls:
                eachline = eachline.replace('\r', '')
                eachline = eachline.replace('\n', '')
                eachulp = urlparse(eachline)
                if baseulp.scheme == eachulp.scheme and \
                        baseulp.netloc == eachulp.netloc:
                    fullpath = eachulp.path
                    if fullpath.find('.') == -1 and \
                            not fullpath.endswith('/'):
                        fullpath += '/'
                    pos = 0
                    while True:
                        pos = fullpath.find('/', pos)
                        if pos == -1:
                            break
                        tmppth = eachulp.scheme + '://' + eachulp.netloc + \
                            eachulp.path[:pos]
                        if not tmppth.endswith('/') and tmppth not in paths:
                            paths.append(tmppth)
                        pos += 1
            return paths
        except Exception, e:
            print 'Exception:\t', e
            return [url]
# Stdlib dependencies; UDPService and ThreadPool come from the surrounding
# project.
from os import path
from socket import timeout
from time import time


class CheckFileService(UDPService):

    def __init__(self, ip, port, nodes, timeout, directory,
                 file_transfer_service):
        super(CheckFileService, self).__init__(name='CheckFile', ip=ip,
                                               port=port)
        self.nodes = nodes
        self.directory = directory
        self.client_socket.settimeout(timeout)
        self.threadpool = ThreadPool(2)
        self.file_transfer_service = file_transfer_service

    def start_service(self):
        self.threadpool.add_task(self.start_server)
        self.threadpool.wait_completion()

    def process_server_response(self, message, address):
        # Reply with "<file exists>,<TCP port for transfers>".
        file_existance = str(self.check_file_existance(message))
        tcp_server_port = str(self.file_transfer_service.get_server_port())
        self.send_message(str_message=file_existance + ',' + tcp_server_port,
                          address=address,
                          socket=self.server_socket)

    def check_file_existance(self, file_path):
        return path.exists(self.directory + '/' + file_path)

    def get_nodes_response_times(self, file_request_message):
        """Ask every other node about a file and time each response."""
        nodes_responses = []
        for node in self.nodes:
            if node != self.ip:  # 'is not' compared identity, not equality
                address = (node, self.port)
                start_time = time()
                self.send_message(str_message=file_request_message,
                                  address=address,
                                  socket=self.client_socket)
                try:
                    response = self.get_message(self.client_socket)
                    nodes_responses.append({
                        "node": node,
                        "response": response,
                        "response_time": time() - start_time
                    })
                except timeout:
                    print(node + " node timed out")
        return nodes_responses

    def process_get_file_request(self, file_path):
        """Return the fastest node that holds `file_path`, or None."""
        nodes_responses = self.get_nodes_response_times(file_path)
        nodes_has_file = []
        for node_response in nodes_responses:
            # The original indexed the list with a string key; each entry of
            # nodes_responses is a dict.
            response_message = node_response["response"]["message"].split(',')
            node_has_file = response_message[0] == "True"
            if node_has_file:
                nodes_has_file.append({
                    "node": node_response["node"],
                    "node_port": response_message[1],
                    "response_time": node_response["response_time"]
                })
        # An empty list is never None, so the original check always passed
        # and min() could raise on an empty sequence.
        if nodes_has_file:
            return min(nodes_has_file, key=lambda t: t["response_time"])
        return None
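
# CheckFileService picks the fastest replica by timing one UDP round trip per
# node. A self-contained sketch of that probe-and-pick pattern with stdlib
# sockets; node addresses, port and payload below are illustrative, and
# unreachable nodes are simply skipped, as in get_nodes_response_times.
import socket
import time


def probe(nodes, port, payload, wait=1.0):
    """Send `payload` to each node and return (node, rtt, reply) tuples,
    skipping nodes that do not answer within `wait` seconds."""
    results = []
    sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)
    sock.settimeout(wait)
    for node in nodes:
        start = time.time()
        sock.sendto(payload, (node, port))
        try:
            reply, _ = sock.recvfrom(4096)
        except socket.timeout:
            continue  # node timed out; leave it out of the results
        results.append((node, time.time() - start, reply))
    sock.close()
    return results


# Picking the fastest node that answered "True,<port>":
# hits = [r for r in probe(['10.0.0.2', '10.0.0.3'], 3001, b'data/x.txt')
#         if r[2].split(b',')[0] == b'True']
# fastest = min(hits, key=lambda r: r[1]) if hits else None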
# Stdlib dependencies; config, misc, sd, concurrent, _mboxExecCmd, runTask,
# ThreadPool and the MAILBOX_*/MESSAGE_* constants come from the surrounding
# module.
import errno
import logging
import os
import struct
import threading
import time
import uuid


class SPM_MailMonitor:

    log = logging.getLogger('storage.MailBox.SpmMailMonitor')

    def registerMessageType(self, messageType, callback):
        self._messageTypes[messageType] = callback

    def unregisterMessageType(self, messageType):
        del self._messageTypes[messageType]

    def __init__(self, pool, maxHostID, monitorInterval=2):
        self._messageTypes = {}
        # Save arguments
        self._stop = False
        self._stopped = False
        self._poolID = str(pool.spUUID)
        self._spmStorageDir = pool.storage_repository
        tpSize = config.getint('irs', 'thread_pool_size') // 2
        waitTimeout = 3
        maxTasks = config.getint('irs', 'max_tasks')
        self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
        # *** IMPORTANT NOTE: The SPM's inbox is the HSMs' outbox and vice
        # versa ***
        self._inbox = os.path.join(self._spmStorageDir, self._poolID,
                                   "mastersd", sd.DOMAIN_META_DATA, "inbox")
        if not os.path.exists(self._inbox):
            self.log.error("SPM_MailMonitor create failed - inbox %s does "
                           "not exist" % repr(self._inbox))
            raise RuntimeError("SPM_MailMonitor create failed - inbox %s "
                               "does not exist" % repr(self._inbox))
        self._outbox = os.path.join(self._spmStorageDir, self._poolID,
                                    "mastersd", sd.DOMAIN_META_DATA, "outbox")
        if not os.path.exists(self._outbox):
            self.log.error("SPM_MailMonitor create failed - outbox %s does "
                           "not exist" % repr(self._outbox))
            raise RuntimeError("SPM_MailMonitor create failed - outbox %s "
                               "does not exist" % repr(self._outbox))
        self._numHosts = int(maxHostID)
        self._outMailLen = MAILBOX_SIZE * self._numHosts
        self._monitorInterval = monitorInterval
        # TODO: add support for multiple paths (multiple mailboxes)
        self._outgoingMail = self._outMailLen * "\0"
        self._incomingMail = self._outgoingMail
        self._inCmd = ['dd',
                       'if=' + str(self._inbox),
                       'iflag=direct,fullblock',
                       'count=1']
        self._outCmd = ['dd',
                        'of=' + str(self._outbox),
                        'oflag=direct',
                        'iflag=fullblock',
                        'conv=notrunc',
                        'count=1']
        self._outLock = threading.Lock()
        self._inLock = threading.Lock()
        # Clear outgoing mail
        self.log.debug("SPM_MailMonitor - clearing outgoing mail, command "
                       "is: %s", self._outCmd)
        cmd = self._outCmd + ['bs=' + str(self._outMailLen)]
        (rc, out, err) = _mboxExecCmd(cmd, data=self._outgoingMail)
        if rc:
            self.log.warning("SPM_MailMonitor couldn't clear outgoing mail, "
                             "dd failed")
        t = concurrent.thread(self.run, name="mailbox/spm",
                              logger=self.log.name)
        t.start()
        self.log.debug('SPM_MailMonitor created for pool %s' % self._poolID)

    def stop(self):
        self._stop = True

    def isStopped(self):
        return self._stopped

    def getMaxHostID(self):
        return self._numHosts

    def setMaxHostID(self, newMaxId):
        with self._inLock:
            with self._outLock:
                diff = newMaxId - self._numHosts
                if diff > 0:
                    delta = MAILBOX_SIZE * diff * "\0"
                    self._outgoingMail += delta
                    self._incomingMail += delta
                elif diff < 0:
                    # diff is negative, so negate it before slicing; the
                    # original sliced with a negative length and kept the
                    # wrong end of the buffer.
                    delta = MAILBOX_SIZE * -diff
                    self._outgoingMail = self._outgoingMail[:-delta]
                    self._incomingMail = self._incomingMail[:-delta]
                self._numHosts = newMaxId
                self._outMailLen = MAILBOX_SIZE * self._numHosts

    def _validateMailbox(self, mailbox, mailboxIndex):
        chkStart = MAILBOX_SIZE - CHECKSUM_BYTES
        chk = misc.checksum(mailbox[0:chkStart], CHECKSUM_BYTES)
        pChk = struct.pack('<l', chk)  # Assumes CHECKSUM_BYTES equals 4!!!
        if pChk != mailbox[chkStart:chkStart + CHECKSUM_BYTES]:
            self.log.error("SPM_MailMonitor: mailbox %s checksum failed, not "
                           "clearing mailbox, clearing newMail.",
                           str(mailboxIndex))
            return False
        elif pChk == pZeroChecksum:
            return False  # Ignore messages of an empty mailbox
        return True

    def _handleRequests(self, newMail):
        send = False
        # Run through all messages and check whether new messages have
        # arrived since the last read.
        for host in range(0, self._numHosts):
            # Check mailbox checksum
            mailboxStart = host * MAILBOX_SIZE
            isMailboxValidated = False
            for i in range(0, MESSAGES_PER_MAILBOX):
                msgId = host * SLOTS_PER_MAILBOX + i
                msgStart = msgId * MESSAGE_SIZE
                # The first byte of a message is its version; 0 means the
                # message is empty and can be skipped.
                if newMail[msgStart] in ['\0', '0']:
                    continue
                # Most mailboxes are probably empty, so it costs less to
                # check that all messages start with 0 than to validate the
                # mailbox; therefore validation happens only after we find a
                # non-empty message in the mailbox.
                if not isMailboxValidated:
                    if not self._validateMailbox(
                            newMail[mailboxStart:mailboxStart + MAILBOX_SIZE],
                            host):
                        # Clear the invalid mailbox in newMail
                        newMail = newMail[:mailboxStart] + EMPTYMAILBOX + \
                            newMail[mailboxStart + MAILBOX_SIZE:]
                        break
                    self.log.debug("SPM_MailMonitor: Mailbox %s validated, "
                                   "checking mail", host)
                    isMailboxValidated = True
                newMsg = newMail[msgStart:msgStart + MESSAGE_SIZE]
                msgOffset = msgId * MESSAGE_SIZE
                if newMsg == CLEAN_MESSAGE:
                    # Should probably put a setter on outgoingMail which
                    # would take the lock
                    self._outLock.acquire()
                    try:
                        self._outgoingMail = \
                            self._outgoingMail[0:msgOffset] + CLEAN_MESSAGE + \
                            self._outgoingMail[msgOffset + MESSAGE_SIZE:
                                               self._outMailLen]
                    finally:
                        self._outLock.release()
                    send = True
                    continue
                # Message isn't empty, check whether it's new
                isMessageNew = False
                for j in range(msgStart, msgStart + MESSAGE_SIZE):
                    if newMail[j] != self._incomingMail[j]:
                        isMessageNew = True
                        break
                # If the search was exhausted, i.e. the message hasn't
                # changed since the last read, it can be skipped.
                if not isMessageNew:
                    continue
                # We only get here if there is a novel request
                try:
                    msgType = newMail[msgStart + 1:msgStart + 5]
                    if msgType in self._messageTypes:
                        # Use the message class to process the request
                        # according to message-specific logic
                        task_id = str(uuid.uuid4())
                        self.log.debug("SPM_MailMonitor: processing request: "
                                       "%s" % repr(newMail[
                                           msgStart:msgStart + MESSAGE_SIZE]))
                        res = self.tp.queueTask(
                            task_id, runTask,
                            (self._messageTypes[msgType], msgId,
                             newMail[msgStart:msgStart + MESSAGE_SIZE]))
                        if not res:
                            raise Exception("could not queue task")
                    else:
                        self.log.error("SPM_MailMonitor: unknown message "
                                       "type encountered: %s", msgType)
                except RuntimeError as e:
                    self.log.error("SPM_MailMonitor: exception: %s caught "
                                   "while handling message: %s", str(e),
                                   newMail[msgStart:msgStart + MESSAGE_SIZE])
                except Exception:
                    self.log.error("SPM_MailMonitor: exception caught while "
                                   "handling message: %s",
                                   newMail[msgStart:msgStart + MESSAGE_SIZE],
                                   exc_info=True)
        self._incomingMail = newMail
        return send

    def _checkForMail(self):
        # The lock is acquired to make sure that neither _numHosts nor
        # incomingMail changes during checkForMail.
        self._inLock.acquire()
        try:
            cmd = self._inCmd + ['bs=' + str(self._outMailLen)]
            (rc, in_mail, err) = misc.execCmd(cmd, raw=True)
            if rc:
                raise IOError(errno.EIO, "_handleRequests._checkForMail - "
                              "Could not read mailbox: %s" % self._inbox)
            if len(in_mail) != self._outMailLen:
                self.log.error('SPM_MailMonitor: _checkForMail - dd '
                               'succeeded but read %d bytes instead of %d, '
                               'cannot check mail. Read mail contains: %s',
                               len(in_mail), self._outMailLen,
                               repr(in_mail[:80]))
                raise RuntimeError("_handleRequests._checkForMail - Could "
                                   "not read mailbox")
            if self._handleRequests(in_mail):
                self._outLock.acquire()
                try:
                    cmd = self._outCmd + ['bs=' + str(self._outMailLen)]
                    (rc, out, err) = _mboxExecCmd(cmd,
                                                  data=self._outgoingMail)
                    if rc:
                        self.log.warning("SPM_MailMonitor couldn't write "
                                         "outgoing mail, dd failed")
                finally:
                    self._outLock.release()
        finally:
            self._inLock.release()

    def sendReply(self, msgID, msg):
        # The lock is acquired to make sure that neither _numHosts nor
        # outgoingMail changes while they are used.
        self._outLock.acquire()
        try:
            msgOffset = msgID * MESSAGE_SIZE
            self._outgoingMail = \
                self._outgoingMail[0:msgOffset] + msg.payload + \
                self._outgoingMail[msgOffset + MESSAGE_SIZE:self._outMailLen]
            mailboxOffset = (msgID // SLOTS_PER_MAILBOX) * MAILBOX_SIZE
            mailbox = self._outgoingMail[mailboxOffset:
                                         mailboxOffset + MAILBOX_SIZE]
            cmd = self._outCmd + ['bs=' + str(MAILBOX_SIZE),
                                  'seek=' + str(mailboxOffset //
                                                MAILBOX_SIZE)]
            (rc, out, err) = _mboxExecCmd(cmd, data=mailbox)
            if rc:
                self.log.error("SPM_MailMonitor: sendReply - couldn't send "
                               "reply, dd failed")
        finally:
            self._outLock.release()

    def run(self):
        try:
            while not self._stop:
                try:
                    self._checkForMail()
                except Exception:
                    self.log.error("Error checking for mail", exc_info=True)
                time.sleep(self._monitorInterval)
        finally:
            self._stopped = True
            self.tp.joinAll(waitForTasks=False)
            self.log.info("SPM_MailMonitor - Incoming mail monitoring thread "
                          "stopped")
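
# _validateMailbox above and _sendMail in the HSM class below rely on the
# same trailer convention: a 4-byte little-endian checksum packed over the
# rest of the mailbox. A self-contained sketch of that convention; a simple
# byte-sum checksum is assumed here, and the project's misc.checksum may be
# implemented differently.
import struct

CHECKSUM_BYTES = 4


def checksum(data, nbytes):
    # Sum of all byte values, truncated to nbytes (an assumption; only the
    # trailer layout is taken from the code above).
    return sum(bytearray(data)) % (2 ** (nbytes * 8 - 1))


def seal(mailbox_body):
    """Append the packed checksum trailer to a mailbox body."""
    chk = checksum(mailbox_body, CHECKSUM_BYTES)
    return mailbox_body + struct.pack('<l', chk)  # '<l' is 4 bytes


def is_valid(mailbox):
    """Check the trailer the same way _validateMailbox does."""
    body = mailbox[:-CHECKSUM_BYTES]
    trailer = mailbox[-CHECKSUM_BYTES:]
    return struct.pack('<l', checksum(body, CHECKSUM_BYTES)) == trailer


box = seal(b'\0' * 60)
assert is_valid(box)
assert not is_valid(b'x' + box[1:])  # any flipped byte breaks the trailer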
# Stdlib dependencies; config, constants, misc, concurrent, _mboxExecCmd,
# runTask, ThreadPool and the mailbox constants come from the surrounding
# module.
import logging
import struct
import time
import uuid
import Queue


class HSM_MailMonitor(object):

    log = logging.getLogger('storage.MailBox.HsmMailMonitor')

    def __init__(self, inbox, outbox, hostID, queue, monitorInterval):
        # Save arguments
        tpSize = config.getint('irs', 'thread_pool_size') // 2
        waitTimeout = 3
        maxTasks = config.getint('irs', 'max_tasks')
        self.tp = ThreadPool(tpSize, waitTimeout, maxTasks)
        self._stop = False
        self._flush = False
        self._queue = queue
        self._activeMessages = {}
        self._monitorInterval = monitorInterval
        self._hostID = int(hostID)
        self._used_slots_array = [0] * MESSAGES_PER_MAILBOX
        self._outgoingMail = EMPTYMAILBOX
        self._incomingMail = EMPTYMAILBOX
        # TODO: add support for multiple paths (multiple mailboxes)
        self._spmStorageDir = config.get('irs', 'repository')
        self._inCmd = [constants.EXT_DD,
                       'if=' + str(inbox),
                       'iflag=direct,fullblock',
                       'bs=' + str(BLOCK_SIZE),
                       'count=' + str(BLOCKS_PER_MAILBOX),
                       'skip=' + str(self._hostID * BLOCKS_PER_MAILBOX)]
        self._outCmd = [constants.EXT_DD,
                        'of=' + str(outbox),
                        'iflag=fullblock',
                        'oflag=direct',
                        'conv=notrunc',
                        'bs=' + str(BLOCK_SIZE),
                        'seek=' + str(self._hostID * BLOCKS_PER_MAILBOX)]
        self._init = False
        self._initMailbox()  # Read initial mailbox state
        self._msgCounter = 0
        self._sendMail()  # Clear outgoing mailbox
        self._thread = concurrent.thread(self.run, name="mailbox/hsm",
                                         logger=self.log.name)
        self._thread.start()

    def _initMailbox(self):
        # Sync initial incoming mail state with the storage view
        (rc, out, err) = _mboxExecCmd(self._inCmd, raw=True)
        if rc == 0:
            self._incomingMail = out
            self._init = True
        else:
            self.log.warning("HSM_MailboxMonitor - Could not initialize "
                             "mailbox, will not accept requests until init "
                             "succeeds")

    def immStop(self):
        self._stop = True

    def immFlush(self):
        self._flush = True

    def _handleResponses(self, newMsgs):
        rc = False
        for i in range(0, MESSAGES_PER_MAILBOX):
            # Skip unused slots
            if self._used_slots_array[i] == 0:
                continue
            # Skip empty return messages; the first byte of a message is its
            # version, and version 0 means the message is empty.
            start = i * MESSAGE_SIZE
            if newMsgs[start] in ['\0', '0']:
                continue
            # Skip messages that haven't changed since the last read. The
            # original compared the loop index against the final offset,
            # which misclassified a message whose last byte was the only
            # change; for/else runs the else block only when no byte differed.
            for j in range(start, start + MESSAGE_SIZE):
                if newMsgs[j] != self._incomingMail[j]:
                    break
            else:
                continue
            # We only get here if there is a novel reply, so we can remove
            # the message from the active list and the outgoing mail and
            # handle the reply.
            rc = True
            newMsg = newMsgs[start:start + MESSAGE_SIZE]
            if newMsg == CLEAN_MESSAGE:
                del self._activeMessages[i]
                self._used_slots_array[i] = 0
                self._msgCounter -= 1
                self._outgoingMail = self._outgoingMail[0:start] + \
                    MESSAGE_SIZE * "\0" + \
                    self._outgoingMail[start + MESSAGE_SIZE:]
                continue
            msg = self._activeMessages[i]
            self._activeMessages[i] = CLEAN_MESSAGE
            self._outgoingMail = self._outgoingMail[0:start] + \
                CLEAN_MESSAGE + self._outgoingMail[start + MESSAGE_SIZE:]
            try:
                self.log.debug("HSM_MailboxMonitor(%s/%s) - Checking reply: "
                               "%s", self._msgCounter, MESSAGES_PER_MAILBOX,
                               repr(newMsg))
                msg.checkReply(newMsg)
                if msg.callback:
                    try:
                        task_id = str(uuid.uuid4())
                        if not self.tp.queueTask(task_id, runTask,
                                                 (msg.callback,
                                                  msg.volumeData)):
                            raise Exception("could not queue task")
                    except Exception:
                        self.log.error("HSM_MailMonitor: exception caught "
                                       "while running msg callback, for "
                                       "message: %s, callback function: %s",
                                       repr(msg.payload), msg.callback,
                                       exc_info=True)
            except RuntimeError as e:
                self.log.error("HSM_MailMonitor: exception: %s caught while "
                               "checking reply for message: %s, reply: %s",
                               str(e), repr(msg.payload), repr(newMsg))
            except Exception:
                self.log.error("HSM_MailMonitor: exception caught while "
                               "checking reply from SPM, request was: %s "
                               "reply: %s", repr(msg.payload), repr(newMsg),
                               exc_info=True)
        # Finished processing incoming mail; keep it to compare against the
        # next batch.
        self._incomingMail = newMsgs
        return rc

    def _checkForMail(self):
        (rc, in_mail, err) = misc.execCmd(self._inCmd, raw=True)
        if rc:
            raise RuntimeError("_handleResponses.Could not read mailbox - "
                               "rc %s" % rc)
        if len(in_mail) != MAILBOX_SIZE:
            raise RuntimeError("_handleResponses.Could not read mailbox - "
                               "len %s != %s" % (len(in_mail), MAILBOX_SIZE))
        return self._handleResponses(in_mail)

    def _sendMail(self):
        self.log.info("HSM_MailMonitor sending mail to SPM - " +
                      str(self._outCmd))
        chk = misc.checksum(
            self._outgoingMail[0:MAILBOX_SIZE - CHECKSUM_BYTES],
            CHECKSUM_BYTES)
        pChk = struct.pack('<l', chk)  # Assumes CHECKSUM_BYTES equals 4!!!
        self._outgoingMail = \
            self._outgoingMail[0:MAILBOX_SIZE - CHECKSUM_BYTES] + pChk
        _mboxExecCmd(self._outCmd, data=self._outgoingMail)

    def _handleMessage(self, message):
        # TODO: add support for multiple mailboxes
        # Use None as the free-slot sentinel: slot 0 is falsy, so the
        # original `freeSlot = False` bookkeeping mishandled the first slot.
        freeSlot = None
        for i in range(0, MESSAGES_PER_MAILBOX):
            if self._used_slots_array[i] == 0:
                if freeSlot is None:
                    freeSlot = i
                continue
            duplicate = True
            for j in range(0, MESSAGE_SIZE):
                if message[j] != self._activeMessages[i][j]:
                    duplicate = False
                    break
            if duplicate:
                self.log.debug("HSM_MailMonitor - ignoring duplicate message "
                               "%s" % (repr(message)))
                return
        if freeSlot is None:
            raise RuntimeError("HSM_MailMonitor - Active messages list full, "
                               "cannot add new message")
        self._msgCounter += 1
        self._used_slots_array[freeSlot] = 1
        self._activeMessages[freeSlot] = message
        start = freeSlot * MESSAGE_SIZE
        end = start + MESSAGE_SIZE
        self._outgoingMail = self._outgoingMail[0:start] + message.payload + \
            self._outgoingMail[end:]
        self.log.debug("HSM_MailMonitor - start: %s, end: %s, len: %s, "
                       "message(%s/%s): %s" %
                       (start, end, len(self._outgoingMail),
                        self._msgCounter, MESSAGES_PER_MAILBOX,
                        repr(self._outgoingMail[start:end])))

    def run(self):
        try:
            failures = 0
            # Do not start processing requests before the incoming mailbox
            # is initialized.
            while not self._init and not self._stop:
                try:
                    time.sleep(2)
                    self._initMailbox()  # Read initial mailbox state
                except Exception:
                    pass
            while not self._stop:
                try:
                    message = None
                    sendMail = False
                    # If no message is pending, block until a new message or
                    # a stop command arrives.
                    while not self._stop and not message and \
                            not self._activeMessages:
                        try:
                            # Check if a new message is waiting to be sent
                            message = self._queue.get(
                                block=True, timeout=self._monitorInterval)
                            self._handleMessage(message)
                            message = None
                            sendMail = True
                        except Queue.Empty:
                            pass
                    if self._stop:
                        break
                    # If pending messages are available, check whether new
                    # messages are waiting in the queue as well.
                    empty = False
                    while (not empty) and \
                            (len(self._activeMessages) <
                             MESSAGES_PER_MAILBOX):
                        # TODO: Remove single mailbox limitation
                        try:
                            message = self._queue.get(block=False)
                            self._handleMessage(message)
                            message = None
                            sendMail = True
                        except Queue.Empty:
                            empty = True
                    if self._flush:
                        self._flush = False
                        sendMail = True
                    try:
                        sendMail |= self._checkForMail()
                        failures = 0
                    except Exception:
                        self.log.error("HSM_MailboxMonitor - Exception "
                                       "caught while checking for mail",
                                       exc_info=True)
                        failures += 1
                    if sendMail:
                        self._sendMail()
                    # If there are active messages waiting for an SPM reply,
                    # wait a few seconds before performing another IO op.
                    if self._activeMessages and not self._stop:
                        # After repeated failures, back off for one minute
                        # before retrying.
                        if failures > 9:
                            time.sleep(60)
                        else:
                            time.sleep(self._monitorInterval)
                except Exception:
                    self.log.error("HSM_MailboxMonitor - Incoming mail "
                                   "monitoring thread caught exception; "
                                   "will try to recover", exc_info=True)
        finally:
            self.log.info("HSM_MailboxMonitor - Incoming mail monitoring "
                          "thread stopped, clearing outgoing mail")
            self._outgoingMail = EMPTYMAILBOX
            self._sendMail()  # Clear outgoing mailbox
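
# _handleMessage above is a fixed-size slot table: find the first free slot,
# reject duplicates, fail when the table is full. A minimal sketch of that
# pattern; the slot count and message strings are illustrative. Note the
# sentinel must not be a valid slot index, since slot 0 is falsy.
SLOTS = 8  # illustrative table size


class SlotTable(object):

    def __init__(self):
        self.used = [0] * SLOTS
        self.messages = {}

    def add(self, message):
        """Store message in the first free slot; return its index."""
        free = None  # None, not False/0: slot 0 is a valid index
        for i in range(SLOTS):
            if self.used[i] == 0:
                if free is None:
                    free = i
                continue
            if self.messages[i] == message:
                return i  # duplicate: already being processed
        if free is None:
            raise RuntimeError("table full")
        self.used[free] = 1
        self.messages[free] = message
        return free

    def remove(self, i):
        self.used[i] = 0
        del self.messages[i]


table = SlotTable()
assert table.add("extend-volume-1") == 0
assert table.add("extend-volume-1") == 0  # duplicate maps to the same slot
assert table.add("extend-volume-2") == 1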
# Stdlib dependencies; ThreadPool, Database, WebPage, Group, REGroup and log
# come from the surrounding project.
import codecs
import time
from collections import deque
from threading import Lock
from urlparse import urlparse, urljoin

from bs4 import BeautifulSoup


class Crawler(object):

    def __init__(self, args, startURLs):
        self.depth = args.depth          # maximum crawl depth
        self.currentDepth = 1            # crawl depth starts at 1
        self.database = Database(args.dbFile)  # result database
        # Store group ids to file, using UTF-8
        self.groupfile = codecs.open("GroupID.txt", "w", "UTF-8")
        self.threadPool = ThreadPool(args.threadNum)  # thread pool
        self.visitedGroups = set()       # group ids already visited
        self.unvisitedGroups = deque()   # group ids still to visit
        self.groupInfo = []              # all collected group info
        self.lock = Lock()
        self.isCrawling = False          # whether the crawler is running
        # Seed with the start group pages
        for url in startURLs:
            match_obj = REGroup.match(url)
            print "Add start urls:", url
            assert match_obj is not None
            self.unvisitedGroups.append(match_obj.group(1))
        # Maximum number of visits allowed per minute
        self.MAX_VISITS_PER_MINUTE = 10
        # Number of pages visited in the current period
        self.currentPeriodVisits = 0
        # Treat one minute as one visiting period; record its start time
        self.periodStart = time.time()

    def start(self):
        print '\nStart Crawling\n'
        if not self._isDatabaseAvaliable():
            print 'Error: Unable to open database file.\n'
        else:
            self.isCrawling = True
            self.threadPool.startThreads()
            self.periodStart = time.time()  # current period starts now
            # Crawl one depth at a time
            while self.currentDepth < self.depth + 1:
                # Dispatch all pages of the current depth (non-blocking)
                self._assignCurrentDepthTasks()
                # Wait until the pool drains; one drained queue equals one
                # finished depth. taskJoin() would also work, but it cannot
                # be interrupted with Ctrl-C.
                while self.threadPool.getTaskLeft() > 0:
                    print "Task left: ", self.threadPool.getTaskLeft()
                    time.sleep(3)
                print 'Depth %d Finish. Totally visited %d links. \n' % (
                    self.currentDepth, len(self.visitedGroups))
                log.info('Depth %d Finish. Total visited Links: %d\n' % (
                    self.currentDepth, len(self.visitedGroups)))
                self.currentDepth += 1
            self.stop()
            assert self.threadPool.getTaskLeft() == 0
            print "Main Crawling procedure finished!"

    def stop(self):
        self.isCrawling = False
        self.threadPool.stopThreads()
        # Save group ids to file
        for group_id in self.visitedGroups:
            self.groupfile.write(group_id + "\n")
        self.groupfile.close()
        self.database.close()

    def getAlreadyVisitedNum(self):
        # visitedGroups holds the ids already handed to the task queue, some
        # of which may still be in flight, so the true number of visited
        # links is visitedGroups minus the tasks still pending.
        if len(self.visitedGroups) == 0:
            return 0
        return len(self.visitedGroups) - self.threadPool.getTaskLeft()

    def _assignCurrentDepthTasks(self):
        """Assign fetch tasks to the pool, enforcing the visit rate limit."""
        # If this period's quota is exhausted, wait for the period to end
        if self.currentPeriodVisits > self.MAX_VISITS_PER_MINUTE - 1:
            # Wait for all in-flight pages to finish first
            while self.threadPool.getTaskLeft() > 0:
                print "Waiting period ends..."
                time.sleep(1)
            seconds = time.time() - self.periodStart
            if seconds < 60:
                # Sleep away the remainder of the minute (the original slept
                # for the time already elapsed instead).
                time.sleep(int(60 - seconds) + 3)
            self.periodStart = time.time()  # reset the period start
            self.currentPeriodVisits = 0
        # Pull group ids off the unvisited list and hand them to the pool
        while len(self.unvisitedGroups) > 0:
            group_id = self.unvisitedGroups.popleft()
            url = "http://www.douban.com/group/" + group_id + "/"
            self.threadPool.putTask(self._taskHandler, url)
            # Record the group id as visited
            self.visitedGroups.add(group_id)

    def _taskHandler(self, url):
        """Fetch the page at the given url."""
        print "Visiting : " + url
        webPage = WebPage(url)
        flag = webPage.fetch()
        if flag:
            self.lock.acquire()  # guard the visit counter update
            self.currentPeriodVisits += 1
            self.lock.release()
            self._saveTaskResults(webPage)
            self._addUnvisitedGroups(webPage)
            return True
        return False  # page read failed

    def _saveTaskResults(self, webPage):
        """Write the group's info to the database."""
        url, pageSource = webPage.getDatas()
        dbgroup = Group(url, pageSource)
        self.database.saveGroupInfo(dbgroup)

    def _addUnvisitedGroups(self, webPage):
        """Collect unvisited group links, filtering out everything that is
        not a group homepage; valid ids go into unvisitedGroups."""
        # Filter links: 1. http/https pages only; 2. visit each link once
        url, pageSource = webPage.getDatas()
        hrefs = self._getAllHrefsFromPage(url, pageSource)
        for href in hrefs:
            match_obj = REGroup.match(href)
            # Only links matching the group homepage pattern are handled
            if self._isHttpOrHttpsProtocol(href) and match_obj is not None:
                group_id = match_obj.group(1)
                if not self._isGroupRepeated(group_id):
                    # Queue the group id for visiting
                    print "Add group id:", group_id
                    self.unvisitedGroups.append(group_id)

    def _getAllHrefsFromPage(self, url, pageSource):
        """Parse the html source and return every link on the page."""
        hrefs = []
        soup = BeautifulSoup(pageSource)
        results = soup.find_all('a', href=True)
        for a in results:
            # Links must be encoded as utf8: bs4 does not url-encode CJK
            # file links such as http://aa.com/文件.pdf, which would
            # otherwise raise an encoding exception downstream.
            href = a.get('href').encode('utf8')
            if not href.startswith('http'):
                href = urljoin(url, href)  # resolve relative links
            hrefs.append(href)
        return hrefs

    def _isHttpOrHttpsProtocol(self, href):
        protocol = urlparse(href).scheme
        return protocol == 'http' or protocol == 'https'

    def _isGroupRepeated(self, group_id):
        return (group_id in self.visitedGroups) or \
               (group_id in self.unvisitedGroups)

    def _isDatabaseAvaliable(self):
        return self.database.isConn()

    def selfTesting(self, args):
        url = 'http://www.douban.com/group/insidestory/'
        print '\nVisiting http://www.douban.com/group/insidestory/'
        # Check that the network is up and the page can be fetched
        pageSource = WebPage(url).fetch()
        if pageSource is None:
            print 'Please check your network and make sure it\'s connected.\n'
        # Database check
        elif not self._isDatabaseAvaliable():
            print 'Please make sure you have the permission to save data: %s\n' % args.dbFile
        else:
            print 'Create logfile and database Successfully.'
            print 'Already saved the test page, please check the database record.'
            print 'Seems No Problem!\n'
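
# _assignCurrentDepthTasks enforces a fixed visits-per-minute budget by
# counting requests in a one-minute window and sleeping out the remainder.
# The same idea as a small reusable helper; the class name and limit below
# are illustrative, not part of the project.
import time


class MinuteRateLimiter(object):
    """Allow at most `limit` acquisitions per 60-second window."""

    def __init__(self, limit=10):
        self.limit = limit
        self.visits = 0
        self.period_start = time.time()

    def acquire(self):
        if self.visits >= self.limit:
            elapsed = time.time() - self.period_start
            if elapsed < 60:
                time.sleep(60 - elapsed)  # sleep out the rest of the minute
            self.period_start = time.time()  # start a fresh window
            self.visits = 0
        self.visits += 1


limiter = MinuteRateLimiter(limit=10)
for url_index in range(25):  # the 11th and 21st calls block
    limiter.acquire()
    # fetch a page here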