Example #1
import re
import sys
import time
import socket
import struct
import threading
import traceback

# Log, Registry, PrimaryIndex, SecondaryIndex, and the connection pool are
# project-local pieces of the surrounding cif-db codebase.

class Purger(object):
    """
    Eventually, this will submit map reduce jobs. Since we have to do 
    what amounts to full table scans, that's the best way to do it 
    using hadoop. For this POC, it doesn't use MR.
    
    outline:
    
    load index.* registry values
    index.purge_every tells us how long to sleep for between MR submissions
    index.primary.secondary.purge_after tells us the max age of records we'll keep
    index.purge_after is the default if no pri.sec is specified
    
    eg
    
    index.purge_every = 24h
    index.purge_after = 7d 
    index.infrastructure.botnet.purge_after = 10d 
    
    spawn a thread per server
    record them in threadtracker

    """
    def __init__(self, connectionPool, num_servers=1, thread_tracker=None, debug=0):
        self.debug = debug
        self.pool = connectionPool

        self.log = Log(connectionPool)

        self.L("cif-db Purger initializing")

        if thread_tracker is None:
            raise ValueError("thread_tracker parameter can not be None")

        self.thread_tracker = thread_tracker
        self.registry = Registry(connectionPool, debug)

        self.primary_index = PrimaryIndex(connectionPool)
        self.secondary_index = SecondaryIndex(connectionPool)

        # the registry value overrides the num_servers parameter
        self.num_servers = self.registry.get('hadoop.num_servers')
        if self.num_servers is None:
            self.num_servers = 1

        self.purge_every = self.expand_timespec(self.registry.get('index.purge_every'))
        if self.purge_every is None:
            self.purge_every = 24 * 60 * 60
        self.L("Purger will run every " + str(self.purge_every) + " seconds")

        # map: primary index -> secondary index -> max record age in seconds;
        # a dict of dicts, since it is keyed by strings
        self.prisecmap = {}

        for i in self.registry.get():
            m = re.match(r'^index\.([^.]+)\.([^.]+)\.purge_after', i)
            if m is not None:
                self.prisecmap.setdefault(m.group(1), {})[m.group(2)] = self.expand_timespec(self.registry.get(i))

        self.workers = []
        for server in range(self.num_servers):
            thr_title = "Purger daemon %d of %d" % (server + 1, self.num_servers)
            worker_thr = threading.Thread(target=self.run, name=thr_title, args=(server,))
            self.workers.append(worker_thr)
            worker_thr.daemon = True
            worker_thr.start()
            while not worker_thr.is_alive():
                self.log.L("waiting for purger/worker thread to become alive")
                time.sleep(1)
            self.L(thr_title)
            self.thread_tracker.add(id=worker_thr.ident, user='******', host=socket.gethostname(), state='Running', info=thr_title)
        
    def expand_timespec(self, tspec):
        r"""
        accepts \d+[dwh] and returns seconds
        """
        if tspec is None:
            return None
        m = re.match(r"^(\d+)([dwh])$", tspec)
        if m is None:
            self.L("invalid timespec: " + tspec)
            return None
        if m.group(2) == "d":
            return int(m.group(1)) * 24 * 60 * 60
        if m.group(2) == "w":
            return int(m.group(1)) * 7 * 24 * 60 * 60
        if m.group(2) == "h":
            return int(m.group(1)) * 60 * 60
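    # Sanity check (hypothetical REPL values):
    #   expand_timespec("24h") -> 86400
    #   expand_timespec("7d")  -> 604800
    #   expand_timespec("2w")  -> 1209600
    #   expand_timespec("1x")  -> None, after logging "invalid timespec: 1x"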
        
    def remove_index_and_dereference(self, index_th, index_rowkey, co_tbl, index_table, document_rowkey):
        try:
            index_th.delete(index_rowkey)
            co_row = co_tbl.row(document_rowkey)
            # the reference column is "cf:<index_table>_" + index rowkey; this
            # matches the pack in Indexer.commit()
            fmt = "%ds" % (len(index_table) + 4)
            prk = struct.pack(fmt, "cf:" + str(index_table) + "_") + document_rowkey
            if prk in co_row:
                co_tbl.delete(document_rowkey, columns=[prk])
        except Exception as e:
            self.L("Failed to delete reference and index %s: %s\n%s" % (index_table, e, traceback.format_exc()))
            
    def run(self, server):
        """
        thread:

        forever:
            foreach sec: # e.g. botnet, phishing, whitelist
                foreach pri: # e.g. ipv4, ipv6, url
                    submit purge job(pri/sec)
                    record pri in a pri_list
                submit purge job(difference of the sets all_pris and pri_list / sec)
        """
        with self.pool.connection() as dbh:
            secondaries = set(self.secondary_index.names())
            primaries = set(self.primary_index.names())

            while True:
                for sec in secondaries:
                    pri_done = []
                    for pri in primaries:
                        if not self.primary_index.is_group(pri):
                            self.submit_purge_job(dbh, pri, sec)
                            pri_done.append(pri)
                    # groups were skipped above; purge them as one set
                    diff = primaries - set(pri_done)
                    if len(diff) > 0:
                        self.submit_purge_job(dbh, diff, sec)

                time.sleep(self.purge_every)
                self.L("Purger awake after " + str(self.purge_every) + " seconds")
            
    def submit_purge_job(self, dbh, pri, sec):
        """
        future: submit a MR job
        current: just iterate

        FIX atm this is iodef specific, ideally we will handle other document types
        """
        self.L("begin purge of %s/%s" % (pri, sec))

        tables = dbh.tables()
        table_name = "index_" + sec

        if table_name in tables:
            tbl = dbh.table(table_name)
            co_tbl = dbh.table("cif_objs")

            for i in range(self.num_servers):
                self.L("purging index_%s on server %d" % (sec, i))

                pri_enum = self.primary_index.enum(pri)
                if pri_enum is not None:
                    # rowkeys begin with a 2 byte salt and a 1 byte primary-index
                    # enum; see the pack_rowkey_* methods in Indexer
                    rowpre = struct.pack(">HB", i, pri_enum)
                    oldest_allowed = self.lookup_max_lifespan(pri, sec)
                    for key, data in tbl.scan(row_prefix=rowpre, include_timestamp=True):

                        document_rowkey = None
                        data_age = None
                        if 'b:iodef_rowkey' in data:  # iodef handler
                            data_age = data['b:iodef_rowkey'][1]
                            document_rowkey = data['b:iodef_rowkey'][0]
                        #elif 'b:stiix_rowkey' in data: ... etc

                        # purge records older than the configured max lifespan;
                        # HBase cell timestamps are in milliseconds
                        if document_rowkey is not None and time.time() - data_age / 1000.0 > oldest_allowed:
                            # cif_objs.row(iodef_rowkey) contains a column
                            # "cf:index_<sec>_<thisrowkey>"; delete that reference too
                            self.remove_index_and_dereference(tbl, key, co_tbl, table_name, document_rowkey)
    
    def lookup_max_lifespan(self, pri, sec):
        if pri is not None and sec is not None:
            # index.$pri.$sec.purge_after
            rkey = "index.%s.%s.purge_after" % (pri, sec)
            rv = self.registry.get(rkey)
            if rv is not None:
                return self.expand_timespec(rv)
            rv = self.registry.get("index.purge_after")  # global fallback
            if rv is not None:
                return self.expand_timespec(rv)
        return self.expand_timespec("270d")  # hardcoded default
    
    def L(self, msg):
        caller = ".".join([str(__name__), sys._getframe(1).f_code.co_name])
        if self.debug:
            print(caller + ": " + msg)
        else:
            self.log.L(caller + ": " + msg)
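A minimal wiring sketch (hypothetical: the pool setup and the ThreadTracker
constructor are assumptions, not shown in the example above):

    import time
    import happybase

    pool = happybase.ConnectionPool(size=4, host="localhost")
    tracker = ThreadTracker(pool)  # hypothetical; any object with an add() method
    purger = Purger(pool, thread_tracker=tracker, debug=1)

    while True:  # worker threads are daemonic, so keep the main thread alive
        time.sleep(60)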
Example #2
import sys
import struct
import hashlib

# PrimaryIndex, Log, Salt, and RFC5070_IODEF_v1_pb2 are project-local
# modules from the surrounding cif-db codebase.

class Indexer(object):
    """
    Extracts indexable observables (addresses, FQDNs, URLs, hashes, ASNs)
    from IODEF documents and writes salted rowkeys into an index_<type>
    table, cross-referencing each entry in cif_objs.
    """
    def __init__(self, connectionPool, index_type, num_servers=1, table_batch_size=1000, debug=0):
        self.debug = debug
        print("indexer connect")
        self.pool = connectionPool
        print("indexer load primary index map")
        self.primary_index = PrimaryIndex(connectionPool, debug)
        print("index init log")
        self.log = Log(connectionPool)

        self.num_servers = num_servers
        self.packers = {}

        # dynamically load one pack/unpack module per primary index name
        for packer in self.primary_index.names():
            try:
                package = 'DB.PrimaryIndex.PackUnpack'
                self.L("loading packer " + package + "." + packer)
                __import__(package + "." + packer)
                pkg = sys.modules[package + "." + packer]
                self.packers[packer] = getattr(pkg, packer)
            except ImportError as e:
                self.L("warning: failed to load %s: %s" % (packer, e))

        with self.pool.connection() as dbh:
            t = dbh.tables()

            self.table_name = "index_" + index_type

            if self.table_name not in t:
                self.L("index table %s doesn't exist, creating it" % self.table_name)
                dbh.create_table(self.table_name, {'b': {'COMPRESSION': 'SNAPPY'}})

            self.table = dbh.table(self.table_name).batch(batch_size=table_batch_size)
            self.co_table = dbh.table("cif_objs").batch(batch_size=table_batch_size)

            self.reset()
            self.md5 = hashlib.md5()
            self.salt = Salt(self.num_servers, self.debug)
    
    def L(self, msg):
        caller = ".".join([str(__name__), sys._getframe(1).f_code.co_name])
        if self.debug:
            print(caller + ": " + msg)
        else:
            self.log.L(caller + ": " + msg)
            
    def pack_rowkey_ipv4(self, salt, addr):
        return struct.pack(">HB", salt, self.TYPE_IPV4()) + self.packers['ipv4'].pack(addr)

    def pack_rowkey_ipv6(self, salt, addr):
        return struct.pack(">HB", salt, self.TYPE_IPV6()) + self.packers['ipv6'].pack(addr)

    def pack_rowkey_fqdn(self, salt, fqdn):
        return struct.pack(">HB", salt, self.TYPE_FQDN()) + self.packers['domain'].pack(fqdn)

    def pack_rowkey_url(self, salt, url):
        return struct.pack(">HB", salt, self.TYPE_URL()) + self.packers['url'].pack(url)

    def pack_rowkey_email(self, salt, email):
        return struct.pack(">HB", salt, self.TYPE_EMAIL()) + self.packers['email'].pack(email)

    def pack_rowkey_search(self, salt, search):
        return struct.pack(">HB", salt, self.TYPE_SEARCH()) + self.packers['search'].pack(search)

    def pack_rowkey_malware(self, salt, malware_hash):
        return struct.pack(">HB", salt, self.TYPE_MALWARE()) + self.packers['malware'].pack(malware_hash)

    def pack_rowkey_asn(self, salt, asn):
        return struct.pack(">HB", salt, self.TYPE_ASN()) + self.packers['asn'].pack(asn)
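    # Rowkey layout, shared with Purger's scan prefix (struct format ">HB"):
    #   2 bytes  unsigned short, big-endian  salt (server bucket from Salt)
    #   1 byte   unsigned char               primary-index enum (TYPE_*)
    #   n bytes                              packer-specific encoding of the value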
    
    def reset(self):
        self.empty = True
        self.addr = None
        self.rowkey = None
        self.confidence = None
        self.addr_type = None
        self.iodef_rowkey = None
    
    def commit(self):
        """
        Commit the record to the index_* table.
        Update cif_objs(rowkey=self.iodef_rowkey) so that 'b:{self.table_name}_{self.rowkey}' = 1
        Purger will remove the reference when this feed record is purged.

        With hbase, you can put an addt'l cell value into a table/row without
        having to merge. Existing cells won't be affected.
        """
        try:
            rowdict = {
                'b:confidence': str(self.confidence),
                'b:addr_type': str(self.addr_type),
                'b:iodef_rowkey': str(self.iodef_rowkey)
            }

            self.table.put(self.rowkey, rowdict)
            fmt = "%ds" % (len(self.table_name) + 4)
            prk = struct.pack(fmt, "cf:" + str(self.table_name) + "_") + self.rowkey
            self.co_table.put(self.iodef_rowkey, {prk: "1"})

        except Exception as e:
            self.L("failed to put record to %s table: %s" % (self.table_name, e))

        self.reset()
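    # Reference cell sketch: for table_name "index_botnet" and index rowkey RK,
    # the cif_objs row at iodef_rowkey gains a column packed as
    # "cf:index_botnet_" + RK with value "1"; Purger deletes it on purge.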

            
    def extract(self, iodef_rowkey, iodef):
        """
        FIX atm this is iodef specific. ideally we will be able to index other document types
        """
        self.reset()

        self.iodef_rowkey = iodef_rowkey

        self.md5.update(iodef.SerializeToString())
        self.hash = self.md5.digest()

        ii = iodef.Incident[0]

        self.confidence = ii.Assessment[0].Confidence.content
        self.severity = ii.Assessment[0].Impact[0].severity

        # for malware hashes, they appear at the top level for now
        # iodef.incident[].additionaldata.meaning = "malware hash"
        # iodef.incident[].additionaldata.content = "[the hash]"

        if hasattr(ii, 'AdditionalData'):
            for ed in ii.AdditionalData:
                if ed.meaning == "malware hash":
                    self.L("\tIndexing for malware hash")
                    self.rowkey = self.pack_rowkey_malware(self.salt.next(), ed.content)
                    self.commit()

        # addresses and networks are in the EventData[].Flow[].System[] tree

        if hasattr(ii, 'EventData') and len(ii.EventData) > 0:

            for ed in ii.EventData:
                for fl in ed.Flow:
                    for sy in fl.System:
                        for i in sy.Node.Address:
                            self.addr_type = i.category

                            # ipv4 addresses and networks

                            if self.addr_type in (RFC5070_IODEF_v1_pb2.AddressType.Address_category_ipv4_addr,
                                                  RFC5070_IODEF_v1_pb2.AddressType.Address_category_ipv4_net):
                                self.addr = i.content
                                self.rowkey = self.pack_rowkey_ipv4(self.salt.next(), self.addr)
                                self.L("Indexing for ipv4")

                                self.commit()

                            # ipv6 addresses and networks

                            elif self.addr_type in (RFC5070_IODEF_v1_pb2.AddressType.Address_category_ipv6_addr,
                                                    RFC5070_IODEF_v1_pb2.AddressType.Address_category_ipv6_net):
                                self.addr = i.content
                                self.rowkey = self.pack_rowkey_ipv6(self.salt.next(), self.addr)
                                self.L("Indexing for ipv6")

                                self.commit()

                            elif self.addr_type == RFC5070_IODEF_v1_pb2.AddressType.Address_category_asn:
                                self.addr = i.content
                                self.rowkey = self.pack_rowkey_asn(self.salt.next(), self.addr)
                                self.L("Indexing for ASN")

                                self.commit()

                            elif self.addr_type == RFC5070_IODEF_v1_pb2.AddressType.Address_category_ext_value:
                                if i.ext_category == "fqdn":
                                    self.fqdn = i.content
                                    self.rowkey = self.pack_rowkey_fqdn(self.salt.next(), self.fqdn)
                                    self.L("Indexing for FQDN")

                                    self.commit()

                                elif i.ext_category == "url":
                                    self.rowkey = self.pack_rowkey_url(self.salt.next(), i.content)
                                    self.L("Indexing for URL")
                                    self.commit()

                                else:
                                    # enum() yields a single value; see TYPE_* below
                                    e = self.primary_index.enum(i.ext_category)
                                    if e is not None:
                                        self.rowkey = struct.pack(">HB", self.salt.next(), e) + self.packers[i.ext_category].pack(i.content)
                                        self.commit()
                                    else:
                                        self.L("Unknown primary index given " + i.ext_category)

                            else:
                                print("unhandled category: %s" % i)
                    
    def TYPE_IPV4(self):
        return self.primary_index.enum('ipv4')
    
    def TYPE_IPV6(self):
        return self.primary_index.enum('ipv6')
    
    def TYPE_FQDN(self):
        return self.primary_index.enum('domain')
    
    def TYPE_URL(self):
        return self.primary_index.enum('url')
    
    def TYPE_EMAIL(self):
        return self.primary_index.enum('email')
    
    def TYPE_SEARCH(self):
        return self.primary_index.enum('search')
    
    def TYPE_MALWARE(self):
        return self.primary_index.enum('malware')
    
    def TYPE_ASN(self):
        return self.primary_index.enum('asn')
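A minimal usage sketch (hypothetical wiring; the pool setup and the IODEF
document source are assumptions, not shown in the example above):

    import happybase

    pool = happybase.ConnectionPool(size=4, host="localhost")
    indexer = Indexer(pool, "botnet", num_servers=2, table_batch_size=500)

    # iodef is an RFC5070_IODEF_v1_pb2 document; rowkey is the cif_objs
    # rowkey it was stored under
    indexer.extract(rowkey, iodef)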