Example #1
    def __init__ (self, connectionPool, p_index, s_index, debug):
        self.debug = debug
        self.primary_index = p_index
        self.secondary_index = s_index
        self.pool = connectionPool
        
        try:
            self.registry = Registry(connectionPool, debug)
            self.num_servers = self.registry.get('hadoop.num_servers')
            
            if self.num_servers == None:
                self.num_servers = 1

            self.packers = {}
            
            for packer in self.primary_index.names():
                try:
                    package='DB.PrimaryIndex.PackUnpack'
                    self.L("loading packer " + package + "." + packer)
                    __import__(package + "." + packer)
                    pkg = sys.modules[package + "." + packer]
                    self.packers[packer] = getattr(pkg, packer)
                except ImportError as e:
                    self.L("warning: failed to load " + packer)
                  
            with self.pool.connection() as dbh:
                self.tbl_co = dbh.table('cif_objs')
                self.available_tables = dbh.tables()
            
        except Exception as e:
            self.L("failed to open tables")
            print e
            raise
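
The packer-loading loop above uses the __import__/sys.modules/getattr idiom: __import__('pkg.mod') returns the top-level package, so the submodule has to be looked up in sys.modules afterwards. A minimal standalone sketch of the same pattern (package and class names hypothetical):

import sys

def load_packer(package, name):
    # __import__ returns the top-level package, not the submodule,
    # so fetch the submodule from sys.modules after importing it
    modname = package + "." + name
    __import__(modname)
    module = sys.modules[modname]
    # by convention the module defines a class with the same name
    return getattr(module, name)

# hypothetical usage: ipv4 = load_packer('DB.PrimaryIndex.PackUnpack', 'ipv4')
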
Example #2
class Log(object):
    def __init__ (self, connectionPool, myhost = None, debug = 0):
        self.debug = debug
        self.pool = connectionPool
        
        if myhost != None:
            self.myhost = myhost
        else:
            self.myhost = socket.gethostname()
        
        self.registry = Registry(connectionPool, debug)
        self.num_servers = self.registry.get('hadoop.num_servers')

        if self.num_servers == None:
            self.num_servers = 1
        
        self.salt = Salt(self.num_servers, self.debug)
    
    def L(self, msg):
        # build the row first so the except clause below can always reference it
        rowdict = {
            'b:hostname': str(self.myhost),
            'b:msg': str(msg)
        }
        try:
            rowkey = struct.pack(">HI", self.salt.next(), int(time.time()))
            with self.pool.connection() as connection:
                connection.table('log').put(rowkey, rowdict)
        except Exception as e:
            print "failed to put record to 'log' table: ", rowdict

            
Example #3
	def __init__ (self, connectionPool, debug=0):
		self.debug = debug
		self.pool = connectionPool
		self.registry = Registry(connectionPool, debug)
		self.index_to_enum = {}  # name -> enum
		self.enum_to_index = {}  # enum -> name
    	
		self.load_primary_index_map()
Example #4
    def __init__ (self, connectionPool, debug=0):
        self.debug = debug
        self.pool = connectionPool

        self.registry = Registry(connectionPool, debug)
        self.names_list = []
        self.names_dict = {}
        
        self.load_secondary_index_map()
Example #5
    def __init__ (self, connectionPool, myhost = None, debug = 0):
        self.debug = debug
        self.pool = connectionPool
        
        if myhost != None:
            self.myhost = myhost
        else:
            self.myhost = socket.gethostname()
        
        self.registry = Registry(connectionPool, debug)
        self.num_servers = self.registry.get('hadoop.num_servers')

        if self.num_servers == None:
            self.num_servers = 1
        
        self.salt = Salt(self.num_servers, self.debug)
Example #6
    def __init__ (self, hbhost, debug):
        self.debug = debug
        self.dbh = happybase.Connection(hbhost)
        t = self.dbh.tables()

        self.table = self.dbh.table('infrastructure_botnet')
        self.kickit = threading.Semaphore(0)
        self.proc_thread = threading.Thread(target=self.run, args=())
        self.proc_thread.start()
        
        self.botnet_handler = Botnet.Botnet(self.dbh, debug)
        self.registry = Registry(hbhost, debug)
        self.num_servers = self.registry.get('hadoop.num_servers')
        if self.num_servers == None:
            self.num_servers = 1
            
        self.salt = Salt(self.num_servers, self.debug)
Example #7
    def __init__(self, connectionPool, num_servers = 1, thread_tracker = None, debug = 0):
        self.debug = debug
        self.pool = connectionPool

        self.log = Log(connectionPool)

        self.L("cif-db Purger initializing")

        if thread_tracker == None:
            raise Exception("thread_tracker parameter can not be None")

        self.thread_tracker = thread_tracker
        self.registry = Registry(connectionPool, debug)

        self.primary_index = PrimaryIndex(connectionPool)
        self.secondary_index = SecondaryIndex(connectionPool)

        self.num_servers = self.registry.get('hadoop.num_servers')
        if self.num_servers == None:
            self.num_servers = 1

        self.purge_every = self.expand_timespec(self.registry.get('index.purge_every'))
        if self.purge_every == None:
            self.purge_every = 24 * 60 * 60
        self.L("Purger will run every " + str(self.purge_every) + " seconds")

        # primary -> secondary -> purge_after seconds
        self.prisecmap = {}

        for i in self.registry.get():
            m = re.match(r'^index\.([^\.]+)\.([^\.]+)\.purge_after', i)
            if m != None:
                self.prisecmap.setdefault(m.group(1), {})[m.group(2)] = self.expand_timespec(self.registry.get(i))

        self.workers = []
        for server in range(0, self.num_servers):
            thr_title = "Purger daemon %d of %d" % (server, self.num_servers - 1)
            worker_thr = threading.Thread(target=self.run, name=thr_title, args=(server,))
            self.workers.append(worker_thr)
            worker_thr.daemon = True
            worker_thr.start()
            while not worker_thr.isAlive():
                self.log.L("waiting for purger/worker thread to become alive")
                time.sleep(1)
            self.L(thr_title)
            self.thread_tracker.add(id=worker_thr.ident, user='******', host=socket.gethostname(), state='Running', info=thr_title)
Example #8
    def __init__(self, connectionPool, thread_tracker, debug):
        self.debug = debug
        self.pool = connectionPool

        if thread_tracker == None:
            raise Exception("thread_tracker parameter can not be None")

        self.thread_tracker = thread_tracker

        self.registry = Registry(connectionPool, debug)
        self.num_servers = self.registry.get("hadoop.num_servers")
        if self.num_servers == None:
            self.num_servers = 1

        self.batch_size = self.registry.get("hbase.batch_size")
        if self.batch_size == None:
            self.batch_size = 1000

        """
        We create one exploder thread per hbase server. Each thread has its own
        hbase connection.  
        
        foreach server (1 .. numservers)
            spawn_exploder_thread(server)
        """

        self.workers = []
        for server in range(0, self.num_servers):
            thr_title = "Exploder daemon %d of %d" % (server, self.num_servers - 1)
            worker_thr = threading.Thread(target=self.run, name=thr_title, args=(server,))
            self.workers.append(worker_thr)
            worker_thr.daemon = True
            worker_thr.start()
            while not worker_thr.isAlive():
                print "waiting for exploder/worker thread to become alive"
                time.sleep(1)
            self.thread_tracker.add(
                id=worker_thr.ident, user="******", host=socket.gethostname(), state="Running", info=thr_title
            )
Example #9
class SecondaryIndex(object):
    """
    The secondary index is the table name (index_botnet, index_malware) and 
    corresponds to the second part of the query string. 
    
    infrastructure/botnet
    
    pri = infrastructure (ipv4 and ipv6)
    sec = botnet
    """
    def __init__ (self, connectionPool, debug=0):
        self.debug = debug
        self.pool = connectionPool

        self.registry = Registry(connectionPool, debug)
        self.names_list = []
        self.names_dict = {}
        
        self.load_secondary_index_map()

    def exists(self, name):
        return name in self.names_dict

    def names(self):
        return self.names_list

    def load_secondary_index_map(self):
        siv = self.registry.get('index.secondary')
        if siv != None:
            self.names_list = []
            self.names_dict = {}
            for i in re.split(',', siv):
                n = i.strip()
                self.names_list.append(n)
                self.names_dict[n] = 1
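
load_secondary_index_map expects the registry value index.secondary to be a comma-separated string of index names. Roughly, for a hypothetical registry value:

import re

siv = "botnet, malware, phishing"  # hypothetical 'index.secondary' value
names = [part.strip() for part in re.split(',', siv)]
assert names == ['botnet', 'malware', 'phishing']
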
Example #10
class Purger(object):
    """
    Eventually, this will submit map reduce jobs. Since we have to do 
    what amounts to full table scans, that's the best way to do it 
    using hadoop. For this POC, it doesn't use MR.
    
    outline:
    
    load index.* registry values
    index.purge_every tells us how long to sleep for between MR submissions
    index.primary.secondary.purge_after tells us the max age of records we'll keep
    index.purge_after is the default if no pri.sec is specified
    
    eg
    
    index.purge_every = 24h
    index.purge_after = 7d 
    index.infrastructure.botnet.purge_after = 10d 
    
    spawn a thread per server
    record them in threadtracker

    """
    def __init__(self, connectionPool, num_servers = 1, thread_tracker = None, debug = 0):
        self.debug = debug
        self.pool = connectionPool
        
        self.log = Log(connectionPool)
        
        self.L("cif-db Purger initializing")
        
        if thread_tracker == None:
            raise Exception("thread_tracker parameter can not be None")
        
        self.thread_tracker = thread_tracker
        self.registry = Registry(connectionPool, debug)
        
        self.primary_index = PrimaryIndex(connectionPool)
        self.secondary_index = SecondaryIndex(connectionPool)
        
        self.num_servers = self.registry.get('hadoop.num_servers')
        if self.num_servers == None:
            self.num_servers = 1 
            
        self.purge_every = self.expand_timespec(self.registry.get('index.purge_every'))
        if self.purge_every == None:
            self.purge_every = 24 * 60 * 60
        self.L("Purger will run every " + str(self.purge_every) + " seconds")
        
        # primary -> secondary -> purge_after seconds
        self.prisecmap = {}

        for i in self.registry.get():
            m = re.match(r'^index\.([^\.]+)\.([^\.]+)\.purge_after', i)
            if m != None:
                self.prisecmap.setdefault(m.group(1), {})[m.group(2)] = self.expand_timespec(self.registry.get(i))
            
        self.workers = []
        for server in range(0, self.num_servers):
            thr_title = "Purger daemon %d of %d" % (server, self.num_servers-1)
            worker_thr = threading.Thread(target=self.run, name=thr_title, args=(server,))
            self.workers.append(worker_thr)
            worker_thr.daemon = True
            worker_thr.start()
            while not worker_thr.isAlive():
                self.log.L("waiting for purger/worker thread to become alive")
                time.sleep(1)
            self.L(thr_title)
            self.thread_tracker.add(id=worker_thr.ident, user='******', host=socket.gethostname(), state='Running', info=thr_title)
        
    def expand_timespec(self, tspec):
        """
        accepts \d+[dwh] and returns seconds
        """
        if tspec == None:
            return None
        m = re.match(r"^(\d+)([dwh])$", tspec)
        if m == None:
            self.L("invalid timespec: " + tspec)
            return None
        if m.group(2) == "d":
            return int(m.group(1)) * 24 * 60 * 60
        if m.group(2) == "w":
            return int(m.group(1)) * 7 * 24 * 60 * 60
        if m.group(2) == "h":
            return int(m.group(1)) * 60 * 60
        
    def remove_index_and_dereference(self, index_th, index_rowkey, co_tbl, index_table, document_rowkey):
        try:
            index_th.delete(index_rowkey)
            co_row = co_tbl.row(document_rowkey)
            fmt = "%ds" % (len(index_table) + 4)  # Also in Indexer
            prk = struct.pack(fmt, "cf:" + str(index_table) + "_") + document_rowkey
            if prk in co_row:
                co_tbl.delete(document_rowkey, columns=[prk])
        except Exception as e:
            self.L("Failed to delete reference and index: " + index_table + str(e) + traceback.format_exc(None))
            
    def run(self, server):
        """
        thread:
        
        forever:
            foreach sec: # eg botnet, phishing, whitelist
                foreach pri: # eg ipv4 ipv6 url
                    submit purge job(pri/sec)
                    record pri in a pri_list
                submit purge job(difference of the sets all_pris and pri_list / sec)
        """
        with self.pool.connection() as dbh:
            secondaries = set(self.secondary_index.names())
            primaries = set(self.primary_index.names())
    
            while True:
                pri_done = []
                for sec in secondaries:
                    for pri in primaries:
                        if self.primary_index.is_group(pri) == False:
                            self.submit_purge_job(dbh, pri, sec)
                        pri_done.append(pri)  # remove groups too
                    # pri_done is a subset of primaries
                    diff = primaries - set(pri_done)
                    if len(diff) > 0:
                        self.submit_purge_job(dbh, diff, sec)
                    
                time.sleep(self.purge_every)
                self.L("Purger awake after " + str(self.purge_every) + " seconds")
            
    def submit_purge_job(self, dbh, pri, sec):
        """
        future: submit a MR job
        current: just iterate
        
        FIX atm this is iodef specific, ideally we will handle other document types
        """
        self.L("begin purge of %s/%s" % (pri, sec))
        
        tables = dbh.tables()
        table_name = "index_" + sec
        
        if table_name in tables:
            tbl = dbh.table("index_" + sec)
            co_tbl = dbh.table("cif_objs")
            
            for i in range(0, self.num_servers):
                self.L("purging index_%s on server %d" %(sec, i))
                
                pri_enum = self.primary_index.enum(pri)
                if pri_enum != None:
                    rowpre = struct.pack(">HB", i, pri_enum)
                    oldest_allowed = self.lookup_max_lifespan(pri, sec)
                    for key, data in tbl.scan(row_prefix=rowpre, include_timestamp=True):

                        document_rowkey = None
                        data_age = None
                        if 'b:iodef_rowkey' in data:  # iodef handler
                            data_age = data['b:iodef_rowkey'][1]
                            document_rowkey = data['b:iodef_rowkey'][0]
                        #elif 'b:stix_rowkey' in data: ... etc

                        if document_rowkey == None:
                            continue  # unsupported document type, nothing to purge

                        # purge entries whose age exceeds the configured lifespan
                        if time.time() - data_age > oldest_allowed:
                            # cif_objs.row(iodef_rowkey) will contain a column "cf:index_$sec_$thisrowkey"; we want to delete that reference
                            self.remove_index_and_dereference(tbl, key, co_tbl, table_name, document_rowkey)
    
    def lookup_max_lifespan(self, pri, sec):
        return 86400  # FIX: hardcoded for now; the registry lookup below is unreachable
        if pri != None and sec != None:
            # index.$pri.$sec.purge_after
            rkey = "index.%s.%s.purge_after" % (pri, sec)
            rv = self.registry.get(rkey)
            if rv != None:
                return self.expand_timespec(rv)
            else:
                rv = self.registry.get("index.purge_after") # global fallback
                if rv != None:
                    return self.expand_timespec(rv)
        return self.expand_timespec("270d")  # hardcoded default
    
    def L(self, msg):
        caller = ".".join([str(__name__), sys._getframe(1).f_code.co_name])
        if self.debug != None and self.debug > 0:
            print caller + ": " + msg
        else:
            self.log.L(caller + ": " + msg)
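
For reference, the timespec grammar accepted by Purger.expand_timespec above can be exercised standalone; this sketch mirrors its \d+[dwh] handling:

import re

def expand_timespec(tspec):
    # mirrors Purger.expand_timespec: "\d+[dwh]" -> seconds, None on bad input
    if tspec == None:
        return None
    m = re.match(r"^(\d+)([dwh])$", tspec)
    if m == None:
        return None
    unit = {"h": 60 * 60, "d": 24 * 60 * 60, "w": 7 * 24 * 60 * 60}[m.group(2)]
    return int(m.group(1)) * unit

assert expand_timespec("24h") == 24 * 60 * 60
assert expand_timespec("2w") == 2 * 7 * 24 * 60 * 60
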
Example #11
class PrimaryIndex(object):
	"""
	The primary index is the first part of the query string. Eg. "infrastructure" or "url".
	This corresponds to the third byte of the hbase rowkey. We allow for groups in the
	primary index. For example,
	
	ipv4 = 0
	ipv6 = 1
	infrastructure = ipv4,ipv6
	
	"""
	def __init__ (self, connectionPool, debug=0):
		self.debug = debug
		self.pool = connectionPool
		self.registry = Registry(connectionPool, debug)
		self.index_to_enum = {}  # name -> enum
		self.enum_to_index = {}  # enum -> name
    	
		self.load_primary_index_map()

	def names(self):
		"""
		Return all of the primary index names, including group names.
    	"""
		return self.index_to_enum.keys()

	def is_group(self, name):
		"""
		If the given name is a group, return True else False
		"""
		if name in self.index_to_enum:
			v = self.index_to_enum[name]
			if type(v) is not int:
				return True
		return False
	
	def reduce_group(self, name):
		"""
		If the given name is a group, return [group member names]
		else return [name]
		"""
		if name not in self.index_to_enum:
			return [name]

		v = self.index_to_enum[name]
		if type(v) is int:
			return [name]

		rv = []

		for innername in re.split(',', v):
			rv.append(innername.strip())

		return rv
	
	def enum(self, name):
		"""
		Return the enum value(s) for the given primary index name.
		This function returns a list. In the case where the given index name
		is a group, multiple enum values will be returned.
    	""" 
		enums = []
		if name in self.index_to_enum:
			v = self.index_to_enum[name]
			if type(v) is int:
				return v
			else:
				for innername in re.split(',', v):
					enums.append(self.enum(innername.lstrip().rstrip()))

		return enums

	def name(self, enum):
		"""
		Given an index enumeration value, return the name of the index
		"""
		if enum in self.enum_to_index:
			return self.enum_to_index[enum]
		return None

	def load_primary_index_map(self):
		for reg_key in self.registry.get():
			reg_val = self.registry.get(reg_key)
			if re.match(r'^index\.primary\.', reg_key):
				x = re.split(r'\.', reg_key)
				self.index_to_enum[x[2]] = reg_val
				if type(reg_val) is int:
					self.enum_to_index[reg_val] = x[2]
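
Given the registry layout described in the class docstring, the two maps and the group handling behave roughly as follows (registry contents hypothetical):

# hypothetical registry values:
#   index.primary.ipv4 = 0
#   index.primary.ipv6 = 1
#   index.primary.infrastructure = "ipv4,ipv6"
#
# after load_primary_index_map():
#   index_to_enum == {'ipv4': 0, 'ipv6': 1, 'infrastructure': 'ipv4,ipv6'}
#   enum_to_index == {0: 'ipv4', 1: 'ipv6'}   # groups get no reverse mapping
#
# so:
#   is_group('infrastructure')     -> True
#   reduce_group('infrastructure') -> ['ipv4', 'ipv6']
#   enum('infrastructure')         -> [0, 1]   (list, via recursion)
#   enum('ipv4')                   -> 0        (bare int)
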
Example #12
class Query(object):
    def __init__ (self, connectionPool, p_index, s_index, debug):
        self.debug = debug
        self.primary_index = p_index
        self.secondary_index = s_index
        self.pool = connectionPool
        
        try:
            self.registry = Registry(connectionPool, debug)
            self.num_servers = self.registry.get('hadoop.num_servers')
            
            if self.num_servers == None:
                self.num_servers = 1

            self.packers = {}
            
            for packer in self.primary_index.names():
                try:
                    package='DB.PrimaryIndex.PackUnpack'
                    self.L("loading packer " + package + "." + packer)
                    __import__(package + "." + packer)
                    pkg = sys.modules[package + "." + packer]
                    self.packers[packer] = getattr(pkg, packer)
                except ImportError as e:
                    self.L("warning: failed to load " + packer)
                  
            with self.pool.connection() as dbh:
                self.tbl_co = dbh.table('cif_objs')
                self.available_tables = dbh.tables()
            
        except Exception as e:
            self.L("failed to open tables")
            print e
            raise

    def L(self, msg):
        caller = ".".join([str(__name__), sys._getframe(1).f_code.co_name])
        if self.debug != None and self.debug > 0:
            print caller + ": " + msg
        else:
            syslog.syslog(caller + ": " + msg)
    
    def decode_query(self, qstring):
        """
        Given a query string, return a dictionary containing:
        
        { 'primary' : [INTEGER COUPLE],
          'prinames' : [STRING COUPLE],
          'secondary' : STRING,
          'limiter' : { 'type' : INTEGER, 
                        'value' : STR
                        }
        }
        
        eg: (infra = ipv4, ipv6 = 0, 1)
        
        infrastructure/botnet
        
        { 'primary' : [0,1], 'secondary' : 'botnet', 'limiter' : None }
        
        infrastructure/botnet,10.10.0.0/16
        
        { 'primary' : [0,1], 'secondary' : 'botnet', 'limiter' : { 'type' : 0, 'value' : '10.10.0.0/16' } }
        
        Where 'type', above, is a guess based on the types of things we expect to be queried for:
        IP addresses, domain names, email addresses, URLs
        
        What can we do with this? We can open the correct secondary index table. We can pack the rowkey
        based on the primary index. If the primary index is a couple, we set a start and stop rowkey. 
        If it's only a single value, we use it as a row prefix. If we have a limiter, we pack it based on 
        its type. 
        
        """
        
        rv = {}
        
        if re.match(r'^[a-z0-9]+/[a-z0-9]+$', qstring, flags=re.IGNORECASE):
            # "primary/secondary" only 
            
            indexparts = re.split('/', qstring)
            
            if len(indexparts) != 2:
                raise Exception("Query prefix not in the form of index1/index2")
            
            pi_enum = self.primary_index.enum(indexparts[0])
            
            if type(pi_enum) is int:
                pi_enum = [pi_enum]  # primary was not a group, so we only got a single enum back
                
            if len(pi_enum) > 0 and self.secondary_index.exists(indexparts[1]) == True:
                rv['primary'] = pi_enum
                rv['prinames'] = self.primary_index.reduce_group(indexparts[0])
                rv['secondary'] = indexparts[1]
                rv['limiter'] = { 'type' : None, 'value' : None }
            
        elif re.match(r'^[a-z0-9]+/[a-z0-9]+,', qstring, flags=re.IGNORECASE):
            # "primary/secondary,limiter" both specified

            qparts = re.split(',', qstring)
            
            if len(qparts) > 2:
                qparts[1] = ''.join(qparts[1:])  # collapse extra comma-split parts back into the limiter
                del qparts[2:]
            
            indexparts = re.split('/', qparts[0])
            
            if len(indexparts) != 2:
                raise "Query prefix not in the form of index1/index2"
            
            pi_enum = self.primary_index.enum(indexparts[0])
            
            if type(pi_enum) is int:
                pi_enum = [pi_enum]  # primary was not a group, so we only got a single enum back
                
            limit_enum = self.guesstypeof(qparts[1])
            
            # make sure they didn't give us, eg, an email limiter for a ipv4 primary index
            
            if not limit_enum in pi_enum:
                raise Exception("Limiter mismatched with primary index")
        
            pi_enum = [limit_enum]
            
            if len(pi_enum) > 0 and self.secondary_index.exists(indexparts[1]) == True:
                rv['primary'] = pi_enum
                rv['prinames'] = self.primary_index.name(limit_enum)
                rv['secondary'] = indexparts[1]
                rv['limiter'] = { 'type' : self.guesstypeof(qparts[1]), 'value' : qparts[1] }
        
        else:
            # "limiter" only specified
            
            rv['primary'] = [self.guesstypeof(qstring)]
            rv['prinames'] = self.primary_index.name(self.guesstypeof(qstring))
            rv['secondary'] = None
            rv['limiter'] = { 'type' : self.guesstypeof(qstring), 'value' : qstring }
        
        return rv
    
    def guesstypeof(self, s):
        """
        Try to figure out which primary index apply to the given string.
        ipv4, ipv6, url, email, domain
        
        This information is useful when we get a limiter with no pri/sec hints. So 
        if the query is "10.10.0.0" we want to know that it's an ipv4 address so we can
        construct the start and stop rowkey appropriately.
        """

        try:
            ipv = IP(s).version()
            if ipv == 4:
                return self.primary_index.enum('ipv4')
            if ipv == 6:
                return self.primary_index.enum('ipv6')
        except ValueError as e:
            try:
                o = urlparse(s)

                # a hash, eg 10299abe93984f8e8d8e9f
                if o.scheme == '' and re.match(r'^[0-9a-f]+$', o.path, flags=re.IGNORECASE) != None:
                    return self.primary_index.enum('malware')
                
                # an email, [email protected]
                if o.scheme == '' and re.search(r'@', o.path) != None:
                    return self.primary_index.enum('email')

                # a domainname
                if o.scheme == '' and re.match(r'^[a-zA-Z0-9][a-zA-Z0-9-]{1,61}[a-zA-Z0-9]\.[a-zA-Z]{2,}$', o.path) != None:
                    return self.primary_index.enum('domain')
                
                # a url
                if o.scheme != '':
                    return self.primary_index.enum('url')
                
                # an asn (note: all-digit strings match the hash check above first,
                # so this branch is currently unreachable for pure digits)
                if o.scheme == '' and re.match(r'^\d+$', o.path) != None:
                    return self.primary_index.enum('asn')
                
            except ValueError as e:
                return self.primary_index.enum('search')

        return self.primary_index.enum('search')
    
    def setqr(self, qr):
        self.qr = qr

    def ipv4_to_start_end_ints(self, v4):
        """
        Given (possibly) a cidr block, return the start addr and
        end addr (plus one) as ints. if no mask given, end = start
        """
        p = v4.split('/')
        octs = p[0].split('.')
        if len(octs) != 4:
            self.L ("ipv4_to_start_end_ints: Invalid parameter: " + str(v4))
            return 0

        for i in range(0,4):
            octs[i] = int(octs[i])

        start = octs[0] << 24 | octs[1] << 16 | octs[2] << 8 | octs[3]
        if len(p) == 2:
            maskbits = int(p[1])
            if maskbits < 0 or maskbits > 32:
                self.L("ipv4_to_start_end_ints: Invalid bitmask: " + str(maskbits))
                return 0

            # mask covers the host bits: 32 - prefix length (e.g. /16 -> 0xFFFF)
            mask = 2 ** (32 - maskbits) - 1
            end = start | mask
            end = end + 1
        else:
            end = start
        return [start, end]
        
    def setlimit(self, limit):
        self.limit = limit

    """
    we will fetch up to self.limit records matching the query, pack them into
    iodef documents, insert them into the QueryResponse and return that. 
    
    that object (the QR) will be placed back into the control message and sent
    back to the client from which it came.
    
    infra/botnet
    infra/malware
    infra/scan
    domain/botnet
    domain/malware
    url/botnet
    url/malware
    url/phishing
    
    <2 byte salt>
        ipv4    = 0x0   (infrastructure/botnet)
        ipv6    = 0x1   (infrastructure/botnet)
        fqdn    = 0x2   (domain/botnet)
        url     = 0x3   (url/botnet)
        email   = 0x4   (email/botnet)
        search  = 0x5   (search/botnet)
        malware = 0x6   (malware/botnet)
        asn     = 0x7   (asn/botnet)
    
    so to query for all infra_botnet, thread out for each salt (one thread per salt val) and 
    append 0x0 or 0x1 

    if they give a specific netblock or ip, append that as well

    
    for domain_botnet, one thread per salt and append 0x2, for a specific domain, append
    the length (2 bytes) and then the domain
    
    """
    def execqr(self):
        self.L("execute query: " + self.qr.query)

        try:
            decoded_query = self.decode_query(self.qr.query)

            # infrastructure/botnet,[email protected]   {'limiter': {'type': 4, 'value': u'*****@*****.**'}, 'primary': [0, 1], 'secondary': u'malware'}
            #     result: invalid/mismatched limiter/primary
            
            # infrastructure/botnet  {'limiter': {'type': None, 'value': None}, 'secondary': u'botnet', 'primary': [0, 1], 'prinames': ['ipv4', 'ipv6']}
            #     result: valid, query index_botnet for all ipv4/ipv6 rows
            
            # 10.10.0.0/16  {'limiter': {'type': 0, 'value': u'10.10.0.0/16'}, 'secondary': None, 'primary': None, 'prinames': None}
            #     result: valid, query all secondaries for primary type '0' and pack 10.10.0.0 onto the start and 10.10.255.255 onto the end rowkey
            
            # open table index_$secondary
            #
            # if len(primary) is 2:
            #     pack start rowkey using primary[0]
            #     pack stop rowkey using primary[1]
            # else
            #     pack rowprefix using primary[0]
            #
            # if we have a limiter, pack it into the end of the rowkey
            #   len(primary) must be 1 if we have a limiter
            
            # if stop rowkey != none then use scan(start=,stop=)
            # else use scan(rowprefix=)
            
            secondaries_to_scan = []
            if 'secondary' in decoded_query and decoded_query['secondary'] != None:
                secondaries_to_scan.append(decoded_query['secondary'])
            else:
                secondaries = self.registry.get('index.secondary')
                secondaries_to_scan = re.sub(r'\s*', r'', secondaries).split(',')
            
            qrs = control_pb2.QueryResponse()
            
            # TODO: spawn a thread for each secondary to scan, coalesce results
            # TODO: spawn a thread for each salt to scan, coalesce results
            
            qrs.ReportTime = datetime.datetime.now().isoformat(' ')
            qrs.description = self.qr.query
        
            with self.pool.connection() as dbh:
                for server in range(0, self.num_servers):
                    for secondary in secondaries_to_scan:
                        
                        table_name = "index_" + secondary
                        if not table_name in self.available_tables:
                            continue
                        table = dbh.table(table_name)
                        
                        if decoded_query['primary'] != None:
                            if len(decoded_query['primary']) == 1:
                                rowprefix = struct.pack('>HB', server, decoded_query['primary'][0])
    
                                # limiter/type and limiter/value are always present but may be None
                                if decoded_query['limiter']['type'] != None:
                                    packer = self.primary_index.name(decoded_query['limiter']['type']) # use 'prinames' instead of this lookup
                                    rowprefix = rowprefix + self.packers[packer].pack(decoded_query['limiter']['value'])
                                
                                for key, value in table.scan(row_prefix=rowprefix):
                                    iodef_rowkey = value['b:iodef_rowkey']
                                    iodef_row = self.tbl_co.row(iodef_rowkey)
                                    
                                    for key in iodef_row:
                                        value = iodef_row[key]
                                        if re.match(r'cf:index_', key) == None:
                                            bot = (key.split(":"))[1]
                                            qrs.baseObjectType.append(bot)
                                            qrs.data.append(value)
                                            break

                        
                            elif len(decoded_query['primary']) == 2:
                                
                                startrow = struct.pack('>HB', server, decoded_query['primary'][0])
                                stoprow = struct.pack('>HB', server, decoded_query['primary'][1])
    
                                if decoded_query['limiter']['type'] != None:
                                    print "limiter given of type " + self.primary_index.name(decoded_query['limiter']['type'])
                                    print "we shouldnt get here"
                                    
                                for key, value in table.scan(row_start=startrow, row_stop=stoprow):
                                    iodef_rowkey = value['b:iodef_rowkey']
                                    iodef_row = self.tbl_co.row(iodef_rowkey)
                                    
                                    for key in iodef_row:
                                        value = iodef_row[key]
                                        if re.match(r'cf:index_', key) == None:
                                            bot = (key.split(":"))[1]
                                            qrs.baseObjectType.append(bot)
                                            qrs.data.append(value)
                                            break
                                    
                        elif decoded_query['primary'] == None:
                                print "no primary given case"
                                print "we shouldnt get here"
                
            return qrs
        
        except Exception as e:
            print e
            traceback.print_exc(file=sys.stdout)
            raise e
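
Tying decode_query and ipv4_to_start_end_ints together: for a query like 10.10.0.0/16 the scan bounds for one salt value could be packed along these lines (a sketch; the exact index rowkey layout is project-specific):

import struct

salt = 0        # one scan per salt/server
ipv4_enum = 0   # hypothetical enum for the 'ipv4' primary index
start_int, end_int = 0x0A0A0000, 0x0A0B0000  # ipv4_to_start_end_ints("10.10.0.0/16")

# 2-byte salt + 1-byte primary enum + 4-byte packed address
row_start = struct.pack(">HBI", salt, ipv4_enum, start_int)
row_stop = struct.pack(">HBI", salt, ipv4_enum, end_int)
# table.scan(row_start=row_start, row_stop=row_stop) then covers the block
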
Example #13
class Exploder(object):
    def __init__ (self, connectionPool, thread_tracker, debug):
        self.debug = debug
        self.pool = connectionPool
        
        if thread_tracker == None:
            raise Exception("thread_tracker parameter can not be None")
        
        self.thread_tracker = thread_tracker
        
        self.registry = Registry(connectionPool, debug)
        self.num_servers = self.registry.get('hadoop.num_servers')
        if self.num_servers == None:
            self.num_servers = 1
        
        self.batch_size = self.registry.get('hbase.batch_size')
        if self.batch_size == None:
            self.batch_size = 1000
            
        """
        We create one exploder thread per hbase server. Each thread has its own
        hbase connection.  
        
        foreach server (1 .. numservers)
            spawn_exploder_thread(server)
        """
        
        self.workers = []
        for server in range(0, self.num_servers):
            thr_title = "Exploder daemon %d of %d" % (server, self.num_servers-1)
            worker_thr = threading.Thread(target=self.run, name=thr_title, args=(server,))
            self.workers.append(worker_thr)
            worker_thr.daemon = True
            worker_thr.start()
            while not worker_thr.isAlive():
                print "waiting for exploder/worker thread to become alive"
                time.sleep(1)
            self.thread_tracker.add(id=worker_thr.ident, user='******', host=socket.gethostname(), state='Running', info=thr_title)
        

    def L(self, msg):
        caller = ".".join([str(__name__), sys._getframe(1).f_code.co_name])
        if self.debug != None and self.debug > 0:
            print caller + ": " + msg
        else:
            syslog.syslog(caller + ": " + msg)
    
    def do_some_work(self):
        self.kickit.release()
        
    def getcheckpoint(self, salt):
        t = self.registry.get('exploder.checkpoint.' + str(salt))
        if t != None:
            return t
        return 0
    
    def setcheckpoint(self, salt, ts):
        self.registry.set('exploder.checkpoint.' + str(salt), ts)
    
    def run(self, salt):
        """
        run(salt)
        
        this routine scans the cif_obj db for rows starting at
          "salt" + last checkpoint timestamp
        and ending at row
          "salt" + now()
        
        each row read in is passed to the Indexer for indexing.
        """
        
        self.L("Exploder thread running for salt: " + str(salt))
        
        with self.pool.connection() as dbh:
    
            index_handler = {} # Indexer.Indexer(self.dbh, "botnet", self.num_servers, self.debug)
            
            while True:
                co = dbh.table('cif_objs')
                
                startts = self.getcheckpoint(salt)
                endts = int(time.time())
                processed = 0
    
                #self.L("processing: " + str(startts) + " to " + str(endts))
                
                if startts == 0:
                    startts = 1
                    
                srowid = struct.pack(">HIIIII", salt, startts-1, 0,0,0,0)
                erowid = struct.pack(">HIIIII", salt, endts, 0,0,0,0)
    
                for key, data in co.scan(row_start=srowid, row_stop=erowid):
                    contains = data.keys()[0]
                    obj_data = data[contains]
                    
                    if contains == "cf:RFC5070_IODEF_v1_pb2":
                        iodef = RFC5070_IODEF_v1_pb2.IODEF_DocumentType()
                        try:
                            iodef.ParseFromString(obj_data)
    
                            #print "IODEF: ", iodef
                            ii = iodef.Incident[0]
                            table_type = ii.Assessment[0].Impact[0].content.content
                            
                            self.L("\tIndexing: " + table_type)
                    
                            # check to make sure table_name is in index.secondary
                            #   index.secondary contains a list of configured/permitted secondary index types
                            
                            if not table_type in index_handler:
                                self.L("index handler for table type %s doesnt exist, creating a new handler thread" % (table_type))
                                index_handler[table_type] = Indexer.Indexer(self.pool, table_type, self.num_servers, self.batch_size, self.debug)
                            
                            index_handler[table_type].extract(key, iodef)
                            processed = processed + 1
    
                        except Exception as e:
                            print "Failed to parse restored object: ", e
                            traceback.print_exc()
                    else:
                        print "Contains an unsupported object type: ", contains
                    
                time.sleep(5)
                if processed > 0:
                    self.setcheckpoint(salt, endts)
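
The scan window in run() above is bounded by two 22-byte rowkeys: the 2-byte salt, a 4-byte timestamp (last checkpoint and now, respectively), and four zeroed 4-byte fields. Sketched out:

import struct
import time

salt = 0
startts = 1  # hypothetical last checkpoint (0 gets bumped to 1 above)
endts = int(time.time())

srowid = struct.pack(">HIIIII", salt, startts - 1, 0, 0, 0, 0)
erowid = struct.pack(">HIIIII", salt, endts, 0, 0, 0, 0)
assert len(srowid) == 22  # 2 + 5 * 4 bytes
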
Example #14
global cf
global exploder
global primary_index
global secondary_index
global thread_tracker

try:
    
    print "Connect to HBase"
    connectionPool = HBConnection(hbhost)
    with connectionPool.connection() as connection:
        cif_objs = connection.table('cif_objs').batch(batch_size=5) # set very low for development, set to 1000+ for test/qa/prod
        cif_idl = connection.table('cif_idl')
        
        print "Init Registry"
        registry = Registry(connectionPool, debug)
        num_servers = registry.get('hadoop.num_servers')
        if num_servers == None:
            num_servers = 1
            print "hadoop.num_servers not set. defaulting."
        print "hadoop.num_servers = ", num_servers
        salt = Salt(num_servers, debug)
    
        thread_tracker = ThreadTracker(debug)
        
        global apikeys
        
        log = Log(connectionPool)
        log.L("cif-db initializing")
        
        print "Initializing APIKeys object"
Example #15
class Exploder(object):
    def __init__ (self, hbhost, debug):
        self.debug = debug
        self.dbh = happybase.Connection(hbhost)
        t = self.dbh.tables()

        self.table = self.dbh.table('infrastructure_botnet')
        self.kickit = threading.Semaphore(0)
        self.proc_thread = threading.Thread(target=self.run, args=())
        self.proc_thread.start()
        
        self.botnet_handler = Botnet.Botnet(self.dbh, debug)
        self.registry = Registry(hbhost, debug)
        self.num_servers = self.registry.get('hadoop.num_servers')
        if self.num_servers == None:
            self.num_servers = 1
            
        self.salt = Salt(self.num_servers, self.debug)
        
        
    def L(self, msg):
        caller = ".".join([str(__name__), sys._getframe(1).f_code.co_name])
        if self.debug != None and self.debug > 0:
            print caller + ": " + msg
        else:
            syslog.syslog(caller + ": " + msg)
    
    def do_some_work(self):
        self.kickit.release()
        
    def getcheckpoint(self):
        t = self.registry.get('exploder.checkpoint')
        if t != None:
            return t
        return 0
    
    def setcheckpoint(self, ts):
        self.registry.set('exploder.checkpoint', ts)
    
    def run(self):
        self.L("Exploder running")
        
        while True:
            self.L("waiting for work")
            self.kickit.acquire() # will block provided kickit is 0
            self.L("wakup")
            
            co = self.dbh.table('cif_objs')
            
            self.L("connected to cif_objs")
            
            startts = self.getcheckpoint()
            endts = int(time.time())
            processed = 0

            self.L("processing: " + str(startts) + " to " + str(endts))
            
            salt = 0xFF00  # FIX fix in poc-db at the same time (in writeToDb())
            srowid = struct.pack(">HIIIII", salt, startts, 0,0,0,0)
            erowid = struct.pack(">HIIIII", salt, endts, 0,0,0,0)

            for key, data in co.scan(row_start=srowid, row_stop=erowid):
                contains = data.keys()[0]
                obj_data = data[contains]
                
                if contains == "cf:RFC5070_IODEF_v1_pb2":
                    iodef = RFC5070_IODEF_v1_pb2.IODEF_DocumentType()
                    try:
                        iodef.ParseFromString(obj_data)

                        #print iodef
                        ii = iodef.Incident[0]
                        table_type = ii.Assessment[0].Impact[0].content.content
                        rowkey = None
                        
                        if table_type == "botnet":
                            self.L("botnet")
                            self.botnet_handler.extract(iodef)
                            self.botnet_handler.commit()
                            
                        elif table_type == "malware":
                            self.L("malware")
                                
                    except Exception as e:
                        print "Failed to parse restored object: ", e
                        traceback.print_exc()

    
            self.setcheckpoint(endts+1)
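
The kickit semaphore above is a simple wake-up channel: it starts at zero, so the worker blocks in acquire() until do_some_work() calls release(). A minimal standalone sketch:

import threading

kickit = threading.Semaphore(0)  # counter starts at 0, so acquire() blocks
done = []

def worker():
    kickit.acquire()   # sleeps until someone calls release()
    done.append(True)  # one unit of work per release

t = threading.Thread(target=worker)
t.daemon = True
t.start()

kickit.release()       # the do_some_work() equivalent: wake the worker
t.join(5)
assert done == [True]
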
Example #16
     if o == "-t":
         key_type = a
     elif o == "-v":
         key_value = a
     elif o == "-k":
         key_name = a
     elif o == "-d":
         del_name = a
     elif o == "-h":
         usage()
         sys.exit(2)
     elif o == "-D":
         debug = a
 
 
 reg = Registry("localhost", debug)
 
 if del_name != None:
     reg.delete(del_name)
     kv = reg.get(del_name)
     if kv != None:
         print "Failed to delete the key: it seems to still be in the database."
         
 elif key_name != None:
     if key_type != None and key_value != None:
         key_value = cast(key_type, key_value)
         reg.set(key_name, key_value)
         kv = reg.get(key_name)
         if kv == key_value:
             print key_name + " has been set to " + str(key_value)
         else:
Example #17
except getopt.GetoptError as err:
    print str(err)
    usage()
    sys.exit(2)

hbhost = "localhost"

for o, a in opts:
    if o == "-h":
        usage()
        sys.exit(2)
    elif o == "-H":
        hbhost = a

c = HBConnection(hbhost)
registry = Registry(hbhost, False)
num_servers = registry.get("hadoop.num_servers")
if num_servers == None:
    num_servers = 1

primary_index_map = load_primary_index_map(registry)


for table in c.tables():
    if re.match("^index_", table):
        print "\nChecking: ", table

        index_entries_per_server = {}
        index_entries_per_primary_index = {}

        th = c.table(table)