def register(self):
        """Register this spider in ZooKeeper and block until the manager
        has allocated a data block (task_no) to it.

        Side effects: creates an ephemeral sequential node under
        /spider/spiders/<website>/ and a sequential node under
        /spider/data/running/<website>/, and configures file logging.
        """
        # time.sleep(5)
        # Wait until the manager has set up its watches.
        info("register", 'check spider master node and bitmap status..')
        # Poll until the master node reads 'ready' / the task bitmap is non-empty.
        # NOTE(review): with `and`, the loop exits as soon as EITHER condition
        # becomes false; if the intent is to wait for BOTH to be set, this
        # should probably be `or` — confirm against the manager's behaviour.
        while self.zk.get("/spider/spiders/%s" % self.website())[0] != 'ready' \
                and self.zk.get("/spider/data/running/%s" % self.website())[0] == '':
            time.sleep(0.1)
            
        info("register", 'spider node and bitmap status is ok')
        
        # Short grace period before creating our nodes — presumably to let
        # the manager's watch registration settle; TODO confirm why 0.2s.
        time.sleep(0.2)
        self.node_state = SpiderNodeState(SpiderNodeState.INIT, self.website())
        self.data_state = SpiderDataState(SpiderDataState.INIT, self.website())
        
        # Ephemeral node: vanishes automatically if this process dies, which
        # is presumably how the manager detects dead spiders.
        self.zk_spider_node = self.zk.create('/spider/spiders/%s/sn_' % self.website(),
                                             ephemeral=True,
                                             sequence=True,
                                             value=self.node_state.dumps())
        self.data_state.spider_node_path = self.zk_spider_node
        
        # The data node is NOT ephemeral: it must survive a spider crash so
        # the task can be recycled via /spider/data/error/.
        self.zk_data_node = self.zk.create('/spider/data/running/%s/dn_' % self.website(), 
                                           sequence=True,
                                           value=self.data_state.dumps())  
        
        info("register", 'spider node and data node status is created')
   
        self.ready = False
        def data_alloc_ok(event):
            # Watch callback: fired when the manager updates our spider node
            # (i.e. after it wrote a task_no into it via alloc_task_no).
            
            self.node_state =  SpiderNodeState.loads(self.zk.get(self.zk_spider_node)[0])
            self.node_state.data_node_path = self.zk_data_node
            self.node_state.state = SpiderNodeState.WORKING
            
            info("register", 'data block %s is alloced'%self.node_state.task_no)
            
            self.zk.set(self.zk_spider_node, value = self.node_state.dumps())
            
            # NOTE(review): assigns SpiderNodeState.WORKING to a SpiderDataState —
            # probably meant SpiderDataState.WORKING; confirm the constants agree.
            self.data_state.state = SpiderNodeState.WORKING
            self.data_state.task_no = self.node_state.task_no
            print self.data_state.task_no
            self.zk.set(self.zk_data_node,value = self.data_state.dumps())
            self.ready = True
                        # now can start the spider
            # start()
        
        # Arm the watch; if the manager already marked us READY before the
        # watch was set, take the fast path so the one-shot event isn't missed.
        state = SpiderNodeState.loads( self.zk.get(self.zk_spider_node, watch=data_alloc_ok)[0])
        info("register", 'waiting for data being alloced')
        info("register","spider node state is %s"%state.state)
        if state.state == SpiderNodeState.READY:
            self.ready = True
        
        #print 'data' + str(self.zk.get('/spider/spiders/%s' % self.website()))

        # Per-spider log file, truncated on each start (filemode='w').
        logging.basicConfig(
            filename=os.path.join('./', '%s.log' % (self.name)), level=logging.INFO, filemode='w',
            format='%(asctime)s - %(levelname)s: %(message)s')
        # Busy-wait until the watch callback (or the READY fast path) fires.
        while not self.ready:
            time.sleep(0.1)
 def spiders_website_watch(event):
     """Manager-side watch on /spider/spiders/<website>.

     On a CHILD event: allocates a task to every newly joined spider, then
     scans the running data nodes and moves any whose owning (ephemeral)
     spider node has vanished to /spider/data/error/ for re-allocation.

     NOTE(review): references `self`, so this is presumably a closure inside
     a manager method; the enclosing scope is not visible in this fragment.
     """
     info('spiders_website_watch', 'spiders_website_watch was triggered:' + str(event))
     
     if event.type == EventType.DELETED:
         info('spiders_website_watch', 'spider website was deleted')
         return
     elif event.type == EventType.CHILD:
         # Re-arm the watch (ZooKeeper watches are one-shot).
         childs = self.zk.get_children(event.path, watch=spiders_website_watch)
         for child in childs:
             child_path = event.path + '/' + child
             state = self.zk.get(child_path)
             node_state = SpiderNodeState.loads(state[0])
             if node_state.is_new_node():
                 info('spiders_website_watch', 'new spider joined:' + child_path)
                 # alloc data node for the new spider
                 self.alloc_task_no(child_path)
         
         info('spiders_website_watch', 'check if some spider exit')
         website = event.path.split('/')[-1]
         childs = self.zk.get_children('/spider/data/running/%s' % website)
         
         for child in childs:
             # NOTE(review): deserialized with pickle.loads here but with
             # SpiderDataState.loads elsewhere — presumably equivalent; confirm.
             data_node_state = pickle.loads(self.zk.get('/spider/data/running/%s/%s' % (website, child))[0])
             # Ephemeral spider node gone + task not DONE => spider died mid-task.
             if (not self.zk.exists(data_node_state.spider_node_path)) \
                     and (data_node_state.state != SpiderDataState.DONE):
                 info('spiders_website_watch', 'spider node %s is dead' % (data_node_state.spider_node_path))
                 
                 # Park the orphaned task under /error so alloc_task_no can
                 # hand it to the next joining spider, then drop it from /running.
                 self.zk.create('/spider/data/error/%s/en_'%website,
                                sequence=True,
                                value = data_node_state.dumps())
                 self.zk.delete('/spider/data/running/%s/%s' % (website, child))
 def data_alloc_ok(event):
     """Watch callback on this spider's node: fired once the manager has
     written an allocated task_no into it.

     NOTE(review): references `self`, so this is presumably a closure inside
     register(); the enclosing scope is not visible in this fragment.
     """
     
     self.node_state =  SpiderNodeState.loads(self.zk.get(self.zk_spider_node)[0])
     self.node_state.data_node_path = self.zk_data_node
     self.node_state.state = SpiderNodeState.WORKING
     
     info("register", 'data block %s is alloced'%self.node_state.task_no)
     
     self.zk.set(self.zk_spider_node, value = self.node_state.dumps())
     
     # NOTE(review): assigns SpiderNodeState.WORKING to a SpiderDataState —
     # probably meant SpiderDataState.WORKING; confirm the constants agree.
     self.data_state.state = SpiderNodeState.WORKING
     self.data_state.task_no = self.node_state.task_no
     print self.data_state.task_no
     self.zk.set(self.zk_data_node,value = self.data_state.dumps())
     self.ready = True
Esempio n. 4
0
        def data_alloc_ok(event):
            """Watch callback on the spider's node: fired once the manager
            has written an allocated task_no into it.

            NOTE(review): references self/self.zk from an enclosing scope
            (presumably register()) that is not visible in this fragment.
            """

            self.node_state = SpiderNodeState.loads(
                self.zk.get(self.zk_spider_node)[0])
            self.node_state.data_node_path = self.zk_data_node
            self.node_state.state = SpiderNodeState.WORKING

            info("register",
                 'data block %s is alloced' % self.node_state.task_no)

            self.zk.set(self.zk_spider_node, value=self.node_state.dumps())

            # NOTE(review): assigns SpiderNodeState.WORKING to a
            # SpiderDataState — probably meant SpiderDataState.WORKING; confirm.
            self.data_state.state = SpiderNodeState.WORKING
            self.data_state.task_no = self.node_state.task_no
            print self.data_state.task_no
            self.zk.set(self.zk_data_node, value=self.data_state.dumps())
            self.ready = True
    def alloc_task_no(self, child_path):
        """Allocate a task number to the spider node at `child_path`.

        Prefers recycling a task from /spider/data/error/<website> (left
        behind by a dead spider); otherwise claims the first free slot in
        the website's pickled task bitmap. Finally writes READY + task_no
        back into the spider node via a transaction.
        """
        info('alloc_task_no', 'try to alloc task no for %s'%child_path)
        
        tx = self.zk.transaction()
        # NOTE(review): node_version is read but never used (e.g. for a
        # version-checked tx.set_data) — optimistic-locking intent unclear.
        node_version =  self.zk.get(child_path)[1].version
        
        node_state = SpiderNodeState.loads(self.zk.get(child_path)[0])
        node_state.state = SpiderNodeState.READY
        node_state.task_no = -1
        
        # child_path is /spider/spiders/<website>/sn_xxxx — take the website.
        website = child_path.split('/')[-2]
        
        # First check whether this website has orphaned (error) data nodes.
        childs = self.zk.get_children("/spider/data/error/%s" % website)
        if len(childs)>0:
            info('alloc_task_no','alloc error data node')
            # Recycle the first orphaned task and queue its node for deletion.
            error_node_state = SpiderDataState.loads(self.zk.get("/spider/data/error/%s/%s"%(website,childs[0]))[0])
            tx.delete("/spider/data/error/%s/%s"%(website,childs[0]))
            
            node_state.task_no =  error_node_state.task_no
        else:
            # Otherwise claim the first unset bit of the task bitmap.
            path = "/spider/data/running/%s" % website
            data = self.zk.get(path)
            bitmap = pickle.loads(data[0])
            
            for i in xrange(len(bitmap)):

                if not bitmap[i]:
                    bitmap[i] = 1
                    # NOTE(review): this set() runs OUTSIDE the transaction and
                    # without a version check — racy if two allocations run
                    # concurrently; confirm the manager is single-threaded.
                    self.zk.set(path, pickle.dumps(bitmap))
                    node_state.task_no =  i;
                    break
        info('alloc_task_no', 'alloc task no %s for %s'%(node_state.task_no,child_path))
        
        tx.set_data(child_path, value=node_state.dumps())
        tx.commit()
Esempio n. 6
0
    def register(self):
        """Register this spider in ZooKeeper and block until the manager
        has allocated a data block (task_no) to it.

        Side effects: creates an ephemeral sequential node under
        /spider/spiders/<website>/ and a sequential node under
        /spider/data/running/<website>/, and configures file logging.
        """
        # time.sleep(5)
        # Wait until the manager has set up its watches.
        info("register", 'check spider master node and bitmap status..')
        # NOTE(review): with `and`, the loop exits as soon as EITHER condition
        # becomes false; if the intent is to wait for BOTH to be set, this
        # should probably be `or` — confirm against the manager's behaviour.
        while self.zk.get("/spider/spiders/%s" % self.website())[0] != 'ready' \
                and self.zk.get("/spider/data/running/%s" % self.website())[0] == '':
            time.sleep(0.1)

        info("register", 'spider node and bitmap status is ok')

        # Grace period — presumably lets the manager's watches settle;
        # TODO confirm why 0.2s.
        time.sleep(0.2)
        self.node_state = SpiderNodeState(SpiderNodeState.INIT, self.website())
        self.data_state = SpiderDataState(SpiderDataState.INIT, self.website())

        # Ephemeral node: vanishes automatically if this process dies, which
        # is presumably how the manager detects dead spiders.
        self.zk_spider_node = self.zk.create('/spider/spiders/%s/sn_' %
                                             self.website(),
                                             ephemeral=True,
                                             sequence=True,
                                             value=self.node_state.dumps())
        self.data_state.spider_node_path = self.zk_spider_node

        # The data node is NOT ephemeral: it must survive a spider crash so
        # the task can be recycled via /spider/data/error/.
        self.zk_data_node = self.zk.create('/spider/data/running/%s/dn_' %
                                           self.website(),
                                           sequence=True,
                                           value=self.data_state.dumps())

        info("register", 'spider node and data node status is created')

        self.ready = False

        def data_alloc_ok(event):
            # Watch callback: the manager wrote a task_no into our node.

            self.node_state = SpiderNodeState.loads(
                self.zk.get(self.zk_spider_node)[0])
            self.node_state.data_node_path = self.zk_data_node
            self.node_state.state = SpiderNodeState.WORKING

            info("register",
                 'data block %s is alloced' % self.node_state.task_no)

            self.zk.set(self.zk_spider_node, value=self.node_state.dumps())

            # NOTE(review): SpiderNodeState.WORKING on a SpiderDataState —
            # probably meant SpiderDataState.WORKING; confirm.
            self.data_state.state = SpiderNodeState.WORKING
            self.data_state.task_no = self.node_state.task_no
            print self.data_state.task_no
            self.zk.set(self.zk_data_node, value=self.data_state.dumps())
            self.ready = True
            # now can start the spider
            # start()

        # Arm the watch; if the manager already marked us READY before the
        # watch was set, take the fast path so the one-shot event isn't missed.
        state = SpiderNodeState.loads(
            self.zk.get(self.zk_spider_node, watch=data_alloc_ok)[0])
        info("register", 'waiting for data being alloced')
        info("register", "spider node state is %s" % state.state)
        if state.state == SpiderNodeState.READY:
            self.ready = True

        #print 'data' + str(self.zk.get('/spider/spiders/%s' % self.website()))

        # Per-spider log file, truncated on each start (filemode='w').
        logging.basicConfig(filename=os.path.join('./',
                                                  '%s.log' % (self.name)),
                            level=logging.INFO,
                            filemode='w',
                            format='%(asctime)s - %(levelname)s: %(message)s')
        # Busy-wait until the callback (or the READY fast path) flips ready.
        while not self.ready:
            time.sleep(0.1)
Esempio n. 7
0
class AbstractSpider(Thread):
    def __init__(self, name='spider', workdir=None, store=None):
        info('__init__', 'start to init the spider..')
        Thread.__init__(self)

        if store == None and workdir == None:
            raise Exception('set store value or workdir to create spider')
        self.store = store if store != None else FileStore(workdir, name)

        self.name = name
        self.start_time = time.time()
        self.pause_seconds = 1
        self.urls = []
        self.zk = zookeeper()
        self.zk.start()
        self.zk.ensure_path("/spider/data/running/%s" % self.website())
        self.zk.ensure_path("/spider/spiders/%s" % self.website())
        #self.register()
        info('__init__', 'init ok.')

    def register(self):
        # time.sleep(5)
        # 等待manager设置好watch
        info("register", 'check spider master node and bitmap status..')
        while self.zk.get("/spider/spiders/%s" % self.website())[0] != 'ready' \
                and self.zk.get("/spider/data/running/%s" % self.website())[0] == '':
            time.sleep(0.1)

        info("register", 'spider node and bitmap status is ok')

        time.sleep(0.2)
        self.node_state = SpiderNodeState(SpiderNodeState.INIT, self.website())
        self.data_state = SpiderDataState(SpiderDataState.INIT, self.website())

        self.zk_spider_node = self.zk.create('/spider/spiders/%s/sn_' %
                                             self.website(),
                                             ephemeral=True,
                                             sequence=True,
                                             value=self.node_state.dumps())
        self.data_state.spider_node_path = self.zk_spider_node

        self.zk_data_node = self.zk.create('/spider/data/running/%s/dn_' %
                                           self.website(),
                                           sequence=True,
                                           value=self.data_state.dumps())

        info("register", 'spider node and data node status is created')

        self.ready = False

        def data_alloc_ok(event):

            self.node_state = SpiderNodeState.loads(
                self.zk.get(self.zk_spider_node)[0])
            self.node_state.data_node_path = self.zk_data_node
            self.node_state.state = SpiderNodeState.WORKING

            info("register",
                 'data block %s is alloced' % self.node_state.task_no)

            self.zk.set(self.zk_spider_node, value=self.node_state.dumps())

            self.data_state.state = SpiderNodeState.WORKING
            self.data_state.task_no = self.node_state.task_no
            print self.data_state.task_no
            self.zk.set(self.zk_data_node, value=self.data_state.dumps())
            self.ready = True
            # now can start the spider
            # start()

        state = SpiderNodeState.loads(
            self.zk.get(self.zk_spider_node, watch=data_alloc_ok)[0])
        info("register", 'waiting for data being alloced')
        info("register", "spider node state is %s" % state.state)
        if state.state == SpiderNodeState.READY:
            self.ready = True

        #print 'data' + str(self.zk.get('/spider/spiders/%s' % self.website()))

        # logging config
        logging.basicConfig(filename=os.path.join('./',
                                                  '%s.log' % (self.name)),
                            level=logging.INFO,
                            filemode='w',
                            format='%(asctime)s - %(levelname)s: %(message)s')
        while not self.ready:
            time.sleep(0.1)

    def unregister(self):
        self.data_state.state = SpiderDataState.DONE

        self.zk.ensure_path('/spider/data/completed/%s/' % self.website())
        tx = self.zk.transaction()
        tx.create('/spider/data/completed/%s/%s' %
                  (self.website(), self.zk_data_node.split('/')[-1]),
                  value=self.data_state.dumps())
        tx.delete(self.zk_data_node)
        tx.delete(self.zk_spider_node)
        tx.commit()

    def update_state(self):
        self.zk.set(self.zk_data_node, self.state.dumps())

    def info(self, msg, br=True):
        logging.info(msg)

    def pause(self):
        self.info('pause %s seconds' % self.pause_seconds, br=False)
        map(lambda i: (self.info('.', br=False) and time.sleep(1)),
            range(0, self.pause_seconds))
        self.info('resume')

    def save(self, url, data):
        self.store.save_data(url, data)

    def visit(self, url):
        return urllib2.urlopen(url).read()

    def run(self):
        while True:
            self.register()
            for url in self.urls(self.data_state.task_no):
                try:
                    print url
                    data = self.visit(url)
                    new_md5 = md5(data)
                    data_in_db = self.store.has(url)

                    if data_in_db:
                        self.info('has fetched before:%s,checking md5...' %
                                  url)
                        if new_md5 == data_in_db[0]:
                            self.info('md5 is the same,so skip.')
                        else:
                            self.info('md5 is change, so fetch it again:')
                            self.store.save_data(url, data)
                            self.store.success(url, new_md5, self.website())
                    else:
                        self.save(url, data)
                        self.store.success(url, md5(data), self.website())

                    self.pause()
                except Exception as e:
                    self.info(str(e))
                    try:
                        self.store.error(url)
                    except Exception as e:
                        self.info(str(e))
            self.unregister()

    def website(self):
        return "unknown"

    def stop(self):
        pass
class AbstractSpider(Thread):

    def __init__(self,
                 name='spider',
                 workdir=None,
                 store=None):
        info('__init__', 'start to init the spider..')
        Thread.__init__(self)
        
        if store == None and workdir == None:
            raise Exception('set store value or workdir to create spider')
        self.store = store if store != None else  FileStore(workdir, name)
        
        self.name = name
        self.start_time = time.time()
        self.pause_seconds = 1
        self.urls = []
        self.zk = zookeeper()
        self.zk.start()
        self.zk.ensure_path("/spider/data/running/%s" % self.website())
        self.zk.ensure_path("/spider/spiders/%s" % self.website())
        #self.register()
        info('__init__', 'init ok.')
        
    def register(self):
        # time.sleep(5)
        # 等待manager设置好watch
        info("register", 'check spider master node and bitmap status..')
        while self.zk.get("/spider/spiders/%s" % self.website())[0] != 'ready' \
                and self.zk.get("/spider/data/running/%s" % self.website())[0] == '':
            time.sleep(0.1)
            
        info("register", 'spider node and bitmap status is ok')
        
        time.sleep(0.2)
        self.node_state = SpiderNodeState(SpiderNodeState.INIT, self.website())
        self.data_state = SpiderDataState(SpiderDataState.INIT, self.website())
        
        self.zk_spider_node = self.zk.create('/spider/spiders/%s/sn_' % self.website(),
                                             ephemeral=True,
                                             sequence=True,
                                             value=self.node_state.dumps())
        self.data_state.spider_node_path = self.zk_spider_node
        
        self.zk_data_node = self.zk.create('/spider/data/running/%s/dn_' % self.website(), 
                                           sequence=True,
                                           value=self.data_state.dumps())  
        
        info("register", 'spider node and data node status is created')
   
        self.ready = False
        def data_alloc_ok(event):
            
            self.node_state =  SpiderNodeState.loads(self.zk.get(self.zk_spider_node)[0])
            self.node_state.data_node_path = self.zk_data_node
            self.node_state.state = SpiderNodeState.WORKING
            
            info("register", 'data block %s is alloced'%self.node_state.task_no)
            
            self.zk.set(self.zk_spider_node, value = self.node_state.dumps())
            
            self.data_state.state = SpiderNodeState.WORKING
            self.data_state.task_no = self.node_state.task_no
            print self.data_state.task_no
            self.zk.set(self.zk_data_node,value = self.data_state.dumps())
            self.ready = True
                        # now can start the spider
            # start()
        
        state = SpiderNodeState.loads( self.zk.get(self.zk_spider_node, watch=data_alloc_ok)[0])
        info("register", 'waiting for data being alloced')
        info("register","spider node state is %s"%state.state)
        if state.state == SpiderNodeState.READY:
            self.ready = True
        
        #print 'data' + str(self.zk.get('/spider/spiders/%s' % self.website()))

        # logging config
        logging.basicConfig(
            filename=os.path.join('./', '%s.log' % (self.name)), level=logging.INFO, filemode='w',
            format='%(asctime)s - %(levelname)s: %(message)s')
        while not self.ready:
            time.sleep(0.1)
            
    def unregister(self):
        self.data_state.state = SpiderDataState.DONE

        self.zk.ensure_path('/spider/data/completed/%s/'%self.website())
        tx = self.zk.transaction()
        tx.create('/spider/data/completed/%s/%s'%(self.website(),self.zk_data_node.split('/')[-1]),value = self.data_state.dumps())
        tx.delete(self.zk_data_node)
        tx.delete(self.zk_spider_node)
        tx.commit()
        
    def update_state(self):
        self.zk.set(self.zk_data_node, self.state.dumps())
        
    def info(self, msg, br=True):
        logging.info(msg)

    def pause(self):
        self.info('pause %s seconds' % self.pause_seconds, br=False)
        map(lambda i: (self.info('.', br=False) and time.sleep(1)),
            range(0, self.pause_seconds))
        self.info('resume')

    def save(self, url, data):
        self.store.save_data(url, data)

    def visit(self, url):
        return  urllib2.urlopen(url).read()
    
    def run(self):
        while True:
            self.register()
            for url in self.urls(self.data_state.task_no):
                try:
                    print url
                    data = self.visit(url)
                    new_md5 = md5(data)
                    data_in_db = self.store.has(url)
    
                    if data_in_db:
                        self.info('has fetched before:%s,checking md5...' % url)
                        if new_md5 == data_in_db[0]:
                            self.info('md5 is the same,so skip.')
                        else:
                            self.info('md5 is change, so fetch it again:')
                            self.store.save_data(url, data)
                            self.store.success(url, new_md5, self.website())
                    else:
                        self.save(url, data)
                        self.store.success(url, md5(data), self.website())
                        
                    self.pause()
                except Exception as e:
                    self.info(str(e))
                    try:
                        self.store.error(url)
                    except Exception as e:
                        self.info(str(e))
            self.unregister()

    def website(self):
        return "unknown"
    def stop(self):
        pass