Esempio n. 1
0
    def do_collector_thread(self):
        """Scheduler loop that runs every due collector in its own thread.

        Loops once per second until ``stopper.interrupted`` is true. On each
        pass, every entry of ``self.collectors`` whose ``next_check`` is
        reached, and that has no run still tracked, gets its ``inst.main``
        started through ``threader.create_and_launch``. Threads that are no
        longer alive are joined and forgotten so that the collector can be
        launched again.
        """
        logger.log('COLLECTOR thread launched', part='check')
        # collector name -> thread of the run currently in flight
        running = {}
        while not stopper.interrupted:
            now = int(time.time())
            for (_, entry) in self.collectors.iteritems():
                colname = entry['name']
                inst = entry['inst']
                # Skip when a collection is already running, or when the
                # collector is not due yet
                if colname in running or now < entry['next_check']:
                    continue
                logger.debug('COLLECTOR: launching collector %s' % colname, part='check')
                running[colname] = threader.create_and_launch(inst.main, name='collector-%s' % colname)
                # The next run is scheduled a fixed 10 after the previous
                # deadline, not after the current time
                entry['next_check'] += 10
                entry['last_check'] = now

            # Reap the finished runs; iterate over a copy because entries
            # are removed on the way
            for (colname, worker) in list(running.items()):
                if worker.is_alive():
                    continue
                worker.join()
                del running[colname]

            time.sleep(1)
Esempio n. 2
0
 def set_node_leave(nname):
     """Put the node named *nname* in leave state.

     Looks the node up by name in ``self.nodes`` (under ``self.nodes_lock``)
     and hands it to ``self.set_leave``. When no node has that name, the
     result of ``abort(404, ...)`` is returned instead.

     NOTE(review): ``self`` is not a parameter of this function; it is
     presumably captured from the enclosing scope.
     """
     node = None
     with self.nodes_lock:
         # No early exit: if several entries share the name, the last one wins
         for n in self.nodes.values():
             if n['name'] == nname:
                 node = n
     if node is None:
         return abort(404, 'This node is not found')
     # Log the node that was matched, not the loop variable, which only holds
     # whichever entry happened to be iterated last
     logger.log('PUTTING LEAVE the node %s' % node, part='http')
     self.set_leave(node)
     return
Esempio n. 3
0
    def drop_db(self, h):
        """Forget the database registered under *h* and delete its files.

        The entry is removed from ``self.dbs`` while holding ``self.lock``;
        an entry that is already absent is not an error. Then the directory
        ``<self.ttldb_dir>/<h>`` is removed recursively, ignoring errors.

        *h* is formatted with ``%d``, so it must be an integer.
        """
        # First remove the database from the in-memory registry
        with self.lock:
            try:
                del self.dbs[h]
            except (KeyError, IndexError):
                # A mapping raises KeyError for an unknown key (IndexError is
                # kept for sequence-like containers): if not there, not a
                # problem...
                pass

        # And remove the files of this database
        p = os.path.join(self.ttldb_dir, '%d' % h)
        logger.log("Deleting ttl database tree", p, part='kv')
        shutil.rmtree(p, ignore_errors=True)
Esempio n. 4
0
 def do_detector_thread(self):
     logger.log("DETECTOR thread launched", part="detector")
     while not self.clust.interrupted:
         for (gname, gen) in self.clust.detectors.iteritems():
             logger.debug("LOOK AT DETECTOR", gen)
             interval = int(gen["interval"].split("s")[0])  # todo manage like it should
             should_be_launch = gen["last_launch"] < int(time.time()) - interval
             if should_be_launch:
                 print "LAUNCHING DETECTOR", gen
                 gen["last_launch"] = int(time.time())
                 do_apply = evaluater.eval_expr(gen["apply_if"])
                 print "DO APPLY?", do_apply
                 if do_apply:
                     tags = gen["tags"]
                     for tag in tags:
                         if tag not in self.clust.tags:
                             print "ADDING NEW TAGS", tag
         time.sleep(1)
Esempio n. 5
0
    def set_dead(self, suspect):
        """Mark a known, currently alive node as dead and broadcast it.

        *suspect* is the node description that was received: its ``uuid`` and
        ``incarnation`` keys are required, ``tags`` and ``services`` are
        optional (default ``[]`` and ``{}``).

        Nothing is done when the node is unknown, when the received
        incarnation is older than the stored one, or when the stored node is
        not in ``alive`` state. When the message is about ourselves, our
        incarnation is bumped and an alive broadcast is stacked instead.
        Otherwise the stored node is updated in place and handed to
        ``self.stack_dead_broadcast``.
        """
        incarnation = suspect['incarnation']
        uuid = suspect['uuid']
        tags = suspect.get('tags', [])
        services = suspect.get('services', {})

        # A single lookup covers both "we never had this node" and "the node
        # vanished in the meantime"
        node = self.nodes.get(uuid, None)
        if node is None:
            return

        # Maybe this data is too old
        if incarnation < node['incarnation']:
            return

        # We only care about alive nodes, dead and suspect ones are not
        # interesting :)
        if node['state'] != 'alive':
            return

        # Maybe it's us?? We need to say NO, I'm alive!! A higher incarnation
        # is what makes our alive message win over the dead one
        if uuid == self.uuid:
            logger.log('DEAD: SOMEONE THINK I AM DEAD, BUT I AM ALIVE', part='gossip')
            self.incarnation += 1
            node['incarnation'] = self.incarnation
            self.stack_alive_broadcast(node)
            return

        logger.log('DEAD: I put in dead node %s' % node['name'], part='gossip')
        # Ok it's definitely someone else that is now dead: update our entry
        # and spread the news
        node['incarnation'] = incarnation
        node['state'] = 'dead'
        node['suspect_time'] = int(time.time())
        node['tags'] = tags
        node['services'] = services
        self.stack_dead_broadcast(node)
Esempio n. 6
0
    def set_alive(self, node, bootstrap=False, strong=False):
        """Record *node* as alive and stack an alive broadcast when it is news.

        *node* must carry ``addr``, ``port``, ``name``, ``incarnation`` and
        ``uuid``; its ``state`` key is overwritten with ``'alive'``.

        * Unless *bootstrap* is true, a node with our own address and port is
          skipped.
        * An unknown uuid is stored (under ``self.nodes_lock``) and broadcast.
        * A known uuid is replaced only when *strong* is true or when the
          incarnation is strictly greater than the stored one. The broadcast
          is stacked when the incarnation is greater, or when *strong* is
          true and the stored state was not already ``'alive'``.
        """
        # Reading the keys up front makes a malformed node fail before it is
        # modified
        addr, port, name = node['addr'], node['port'], node['name']
        incarnation = node['incarnation']
        uuid = node['uuid']
        state = 'alive'
        node['state'] = state

        # Maybe it's me? if so skip it
        if not bootstrap and addr == self.addr and port == self.port:
            return

        # Maybe it's a new node that just entered the cluster?
        if uuid not in self.nodes:
            logger.log("New node detected", node, part='gossip')
            # Add the node but in a protected mode
            with self.nodes_lock:
                self.nodes[uuid] = node
            self.stack_alive_broadcast(node)
            return

        prev = self.nodes.get(uuid, None)
        # maybe the prev was removed by another thread?
        if prev is None:
            return
        change = (prev['state'] != state)
        newer = incarnation > prev['incarnation']

        # Without "strong", only strictly newer data is accepted
        if not strong and incarnation <= prev['incarnation']:
            return

        logger.debug('ALIVENODE', name, prev['state'], state, strong, change, incarnation, prev['incarnation'], (strong and change), newer)
        # Past the guard above the data is either strong or newer: store it,
        # protecting the nodes access with the lock so other threads are happy
        with self.nodes_lock:
            self.nodes[uuid] = node
        # Only broadcast if it's new data from somewhere else
        if (strong and change) or newer:
            logger.debug("Updating alive a node", prev, 'with', node)
            self.stack_alive_broadcast(node)
Esempio n. 7
0
 def join(self):
     """Try to join the cluster by doing a push-pull with the seed nodes.

     Each entry of ``self.seeds`` is an ``addr`` or ``addr:port`` string; the
     port defaults to ``self.port``. Rounds of ``self.do_push_pull`` are
     repeated every 0.1s until ``self.nodes`` holds something other than
     exactly one entry, or ``self.interrupted`` or ``self.bootstrap`` is
     set. Returns immediately when there is no seed at all.

     At most KGOSSIP seeds are contacted per round. The seeds are reshuffled
     on every round so that unreachable seeds at the head of the list cannot
     starve the others.
     """
     logger.log("We will try to join our seeds members", self.seeds, part='gossip')
     if not self.seeds:
         logger.log("No seeds nodes, I'm a bootstrap node?")
         return

     others = []
     for e in self.seeds:
         elts = e.split(':')
         addr = elts[0]
         port = self.port
         if len(elts) > 1:
             port = int(elts[1])
         others.append((addr, port))

     while True:
         random.shuffle(others)
         logger.log('JOINING myself %s is joining %s nodes' % (self.name, others), part='gossip')
         for (nb, other) in enumerate(others, 1):
             self.do_push_pull(other)
             # Do not merge with more than KGOSSIP distant nodes: stop the
             # round here (a "continue" as last statement would do nothing)
             if nb >= KGOSSIP:
                 break
         # If we got enough nodes, we exit
         if len(self.nodes) != 1 or self.interrupted or self.bootstrap:
             return
         # Do not hammer the cpu....
         time.sleep(0.1)
Esempio n. 8
0
    def do_ping(self, other):
        """Ping *other* over UDP, falling back to relayed pings on failure.

        A ``ping`` message is sent to ``other['addr']:other['port']`` and an
        answer is awaited for up to 3 seconds; any answer marks the node alive
        through ``self.set_alive(other, strong=True)``.

        On timeout or address resolution error, up to 3 randomly chosen alive
        nodes (neither ourselves nor *other*) are each sent a ``ping-relay``
        message targeting *other*. When there is no such node, *other* is put
        in suspect state right away and nothing else is sent.
        """
        ping_payload = {'type':'ping', 'seqno':0, 'node': other['name'], 'from': self.uuid}
        enc_message = encrypter.encrypt(json.dumps(ping_payload))
        addr = other['addr']
        port = other['port']
        sock = None
        try:
            sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) # UDP
            sock.sendto(enc_message, (addr, port) )
            logger.debug('PING waiting %s ack message' % other['name'], part='gossip')
            # Allow 3s to get an answer
            sock.settimeout(3)
            ret = sock.recv(65535)
            logger.debug('PING got a return from %s' %  other['name'], len(ret), part='gossip')
        except (socket.timeout, socket.gaierror) as exp:
            logger.debug("PING: error joining the other node %s:%s : %s" % (addr, port, exp), part='gossip')
        else:
            # An answer? great it is alive!
            self.set_alive(other, strong=True)
            return
        finally:
            # The direct socket is not used past this point, release it
            # explicitly instead of relying on garbage collection
            if sock is not None:
                sock.close()

        logger.debug("PING: go indirect mode", part='gossip')
        with self.nodes_lock:
            possible_relays = [n for n in self.nodes.values() if n['uuid'] != self.uuid and n != other and n['state'] == 'alive']

        if not possible_relays:
            logger.log("PING: no possible relays for ping", part='gossip')
            self.set_suspect(other)
            # Nobody to ask: do not open a socket for nothing
            return

        # Take at most 3 relays to ask ping
        relays = random.sample(possible_relays, min(len(possible_relays), 3))
        logger.debug('POSSIBLE RELAYS', relays)
        ping_relay_payload = {'type':'ping-relay', 'seqno':0, 'tgt': other['uuid'], 'from': self.uuid}
        enc_message = encrypter.encrypt(json.dumps(ping_relay_payload))
        # NOTE(review): this socket is neither read nor closed in this block;
        # confirm where the relay acks are collected before closing it here
        sock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM) # UDP
        for r in relays:
            try:
                sock.sendto(enc_message, (r['addr'], r['port']) )
                logger.debug('PING waiting ack message', part='gossip')
            except socket.error:
                logger.error('Cannot send a ping relay to %s:%s' % (r['addr'], r['port']), part='gossip')
Esempio n. 9
0
    def look_at_deads(self):
        """Turn timed-out suspect nodes into dead ones and purge old leave nodes.

        The suspect timeout is ``5 * ceil(log10(n + 1)) * 1`` seconds, where
        ``n`` is the number of known nodes; the leave timeout is 3 times that.

        * A ``suspect`` node whose ``suspect_time`` is older than the suspect
          timeout is set to ``dead`` and passed to
          ``self.stack_dead_broadcast``.
        * A ``leave`` node whose ``leave_time`` is older than the leave
          timeout is removed from ``self.nodes``.

        A node without the relevant timestamp is treated as if it had been
        stamped now, so it is never expired on that call.
        """
        # suspect a node for 5 * log(n+1) * interval
        node_scale = math.ceil(math.log10(float(len(self.nodes) + 1)))
        probe_interval = 1
        suspicion_mult = 5
        suspect_timeout = suspicion_mult * node_scale * probe_interval
        leave_timeout = suspect_timeout * 3

        now = int(time.time())
        with self.nodes_lock:
            # Snapshot taken under the lock: the leave pass below iterates it,
            # so it must really hold the current nodes
            nodes = dict(self.nodes)
            for node in nodes.values():
                # Only look at suspect nodes of course...
                if node['state'] != 'suspect':
                    continue
                stime = node.get('suspect_time', now)
                if stime < (now - suspect_timeout):
                    logger.log("SUSPECT: NODE", node['name'], node['incarnation'], node['state'], "is NOW DEAD", part='gossip')
                    node['state'] = 'dead'
                    self.stack_dead_broadcast(node)

        # Now for leave nodes, this time we will really remove the entry from
        # our nodes
        to_del = []
        for (uuid, node) in nodes.items():
            # Only look at leave nodes of course...
            if node['state'] != 'leave':
                continue
            ltime = node.get('leave_time', now)
            logger.debug("LEAVE TIME", node['name'], ltime, now - leave_timeout, (now - leave_timeout) - ltime, part='gossip')
            if ltime < (now - leave_timeout):
                logger.log("LEAVE: NODE", node['name'], node['incarnation'], node['state'], "is now definitivly leaved. We remove it from our nodes", part='gossip')
                to_del.append(uuid)

        # now really remove them from our list :)
        if to_del:
            # Writes to self.nodes are protected by the lock
            with self.nodes_lock:
                for uuid in to_del:
                    # not here any more? it is what we want
                    self.nodes.pop(uuid, None)
Esempio n. 10
0
    def clean_old(self):
        """Expire the TTL databases whose hour is below the current limit.

        The limit is ``NOW.now + 3600`` rounded down to a multiple of 3600.
        Every entry of ``self.ttldb_dir`` whose name is an integer below that
        limit is processed: each key yielded by the ``RangeIter()`` of its
        database is deleted through ``self.kv.delete``, then the database is
        dropped with ``self.drop_db``.
        """
        logger.debug("TTL clean old", part='kv')
        limit_hour = ((NOW.now + 3600) // 3600) * 3600

        # Look at the databases directory, entries are named after their hour
        for entry in os.listdir(self.ttldb_dir):
            try:
                bhour = int(entry)
            except ValueError:
                # someone added an entry that is not an int here...
                continue

            # Not available for cleaning yet
            if bhour >= limit_hour:
                continue

            logger.log("TTL bhour is too low!", bhour, part='kv')
            # take the database and ask the cluster to delete every key in
            # it, whatever the stored value is
            cdb = self.get_ttl_db(bhour)
            for (k, _) in cdb.RangeIter():
                self.kv.delete(k)

            # all old entries are cleaned, remove the idx database
            self.drop_db(bhour)
Esempio n. 11
0
 def bailout_after_leave(self):
     """Wait 10 seconds, then publish the ``interrupt`` event.

     Logs a message before and after the wait. *self* is not used.
     """
     grace_delay = 10  # seconds slept before the interrupt is published
     logger.log('Bailing out in few seconds. I was put in leave state')
     time.sleep(grace_delay)
     logger.log('Exiting from a self leave message')
     # Per the original note, subscribers react to this event by setting
     # self.interrupted = True in every looping thread
     pubsub.pub('interrupt')
Esempio n. 12
0
    def set_leave(self, leaved):
        addr = leaved['addr']
        port = leaved['port']
        name = leaved['name']
        incarnation = leaved['incarnation']
        uuid = leaved['uuid']
        tags = leaved.get('tags', [])
        services = leaved.get('services', {})
        state = 'leave'
        
        print "SET_LEAVE::", leaved
        
        # Maybe we didn't even have this nodes in our list?
        if not uuid in self.nodes:
            return
        
        node = self.nodes.get(uuid, None)
        # The node can vanish by another thread delete
        if node is None:
            return

        # Maybe we already know it's leaved, so don't update it
        if node['state'] == 'leave':
            return

        print "SET LEAVE %s and inner node %s" % (leaved, node)
        
        # If for me it must be with my own incarnation number so we are sure it's really us that should leave
        # and not 
        if uuid == self.uuid:
            if incarnation != node['incarnation']:
                print "LEAVE INCARNATION NOT THE SAME FOR MYSELF"
                return
        else:
            # If not for me, use the classic 'not already known' rule
            if incarnation < node['incarnation']:
                print "LEAVE, NOT FOR ME, THE INCARNATION NUMBER IS TOO OLD"
                return

        print "SET LEAVE UUID and SELF.UUID", uuid, self.uuid
        # Maybe it's us?? If so we must send our broadcast and exit in few seconds
        if uuid == self.uuid:
            logger.log('LEAVE: someone is asking me for leaving.', part='gossip')
            self.incarnation += 1
            node['incarnation'] = self.incarnation
            self.stack_leave_broadcast(node)
            def bailout_after_leave(self):
                logger.log('Bailing out in few seconds. I was put in leave state')
                time.sleep(10)
                logger.log('Exiting from a self leave message')
                # Will set self.interrupted = True to eavery thread that loop                                
                pubsub.pub('interrupt')
                
            threader.create_and_launch(bailout_after_leave, args=(self,))
            return

        logger.log('LEAVING: The node %s is leaving' % node['name'], part='gossip')
        # Ok it's definitivly someone else that is now suspected, update this, and update it :)
        node['incarnation'] = incarnation
        node['state'] = state
        node['leave_time'] = int(time.time())
        node['tags'] = tags
        node['services'] = services
        self.stack_leave_broadcast(node)
Esempio n. 13
0
            sock.settimeout(3*2)
            try:
                ret = sock.recv(65535)
            except socket.timeout:
                # still noone succed to ping it? I suspect it
                self.set_suspect(other)
                sock.close()
                return
            msg = json.loads(ret)
            sock.close()
            logger.debug('PING: got an answer from a relay', msg, part='gossip')
            logger.debug('RELAY set alive', other['name'], part='gossip')
            # Ok it's no more suspected, great :)
            self.set_alive(other, strong=True)
        except socket.error, exp:
            logger.log("PING: cannot join the other node %s:%s : %s" % (addr, port, exp), part='gossip')
        

    # Randomly push some gossip broadcast messages and send them to
    # KGOSSIP other nodes
    def do_gossip_push(self, dest):
        message = ''
        to_del = []
        stack = []
        tags = dest['tags']
        for b in broadcaster.broadcasts:
            # not a valid node for this message, skip it
            if 'tag' in b and b['tag'] not in tags:
                continue
            old_message = message
            send = b['send']
Esempio n. 14
0
 def log(self, *args):
    """Forward every positional argument unchanged to ``logger.log``."""
    logger.log(*args)
Esempio n. 15
0
                self.cur_value = f.read()
                f.close()
            except IOError, exp:
                logger.error('Cannot open path file %s : %s' % (self.g['path'], exp))
                self.output = None
                self.template = ''
                self.buf = ''
                return False
        # If not exists or the value did change, regenerate it :)
        if not os.path.exists(self.g['path']) or self.cur_value != self.output:
            logger.debug('Generator %s generate a new value, writing it to %s' % (self.g['name'], self.g['path']))
            try:
                f = open(self.g['path'], 'w')
                f.write(self.output)
                f.close()
                logger.log('Generator %s did generate a new file at %s' % (self.g['name'], self.g['path']))
                return True
            except IOError, exp:
                logger.error('Cannot write path file %s : %s' % (self.g['path'], exp))
                self.output = None
                self.template = ''
                self.buf = ''
                return False

    # If needed, launch the restart command; it should not block for too
    # long, of course
    def launch_command(self):
        cmd = self.g['command']
        try:
            p = subprocess.Popen(cmd, shell=True, stdout=subprocess.PIPE, stderr=subprocess.PIPE, close_fds=True, preexec_fn=os.setsid)
        except Exception, exp: