import hashlib
import re
import socket
import struct
import sys
import threading
import time
import traceback

# NOTE: the local import paths below are assumptions inferred from how these
# names are used in this module (Log, Registry, index maps, Salt, and the
# generated IODEF protobuf); adjust them to the real package layout.
from DB.Log import Log
from DB.Registry import Registry
from DB.PrimaryIndex import PrimaryIndex
from DB.SecondaryIndex import SecondaryIndex
from DB.Salt import Salt
import RFC5070_IODEF_v1_pb2
class Purger(object):
    """
    Eventually, this will submit map reduce jobs. Since we have to do what
    amounts to full table scans, that's the best way to do it using hadoop.
    For this POC, it doesn't use MR.

    outline:
        load index.* registry values
        index.purge_every tells us how long to sleep between MR submissions
        index.primary.secondary.purge_after tells us the max age of records we'll keep
        index.purge_after is the default if no pri.sec is specified

        eg:
            index.purge_every = 24h
            index.purge_after = 7d
            index.infrastructure.botnet.purge_after = 10d

        spawn a thread per server
        record them in thread_tracker
    """

    def __init__(self, connectionPool, num_servers=1, thread_tracker=None, debug=0):
        self.debug = debug
        self.pool = connectionPool
        self.log = Log(connectionPool)
        self.L("cif-db Purger initializing")

        if thread_tracker is None:
            raise Exception("thread_tracker parameter can not be None")
        self.thread_tracker = thread_tracker

        self.registry = Registry(connectionPool, debug)
        self.primary_index = PrimaryIndex(connectionPool)
        self.secondary_index = SecondaryIndex(connectionPool)

        self.num_servers = self.registry.get('hadoop.num_servers')
        if self.num_servers is None:
            self.num_servers = 1

        self.purge_every = self.expand_timespec(self.registry.get('index.purge_every'))
        if self.purge_every is None:
            self.purge_every = 24 * 60 * 60
        self.L("Purger will run every " + str(self.purge_every) + " seconds")

        # primary -> secondary -> purge_after (seconds), built from the
        # index.<pri>.<sec>.purge_after registry keys; a dict of dicts so it
        # can be keyed by name
        self.prisecmap = {}
        for i in self.registry.get():
            m = re.match(r'^index\.([^\.]+)\.([^\.]+)\.purge_after', i)
            if m is not None:
                self.prisecmap.setdefault(m.group(1), {})[m.group(2)] = self.expand_timespec(self.registry.get(i))

        self.workers = []
        for server in range(0, self.num_servers):
            thr_title = "Purger daemon %d of %d" % (server, self.num_servers - 1)
            worker_thr = threading.Thread(target=self.run, name=thr_title, args=(server,))
            self.workers.append(worker_thr)
            worker_thr.daemon = True
            worker_thr.start()
            while not worker_thr.isAlive():
                self.log.L("waiting for purger/worker thread to become alive")
                time.sleep(1)
            self.L(thr_title)
            self.thread_tracker.add(id=worker_thr.ident, user='******',
                                    host=socket.gethostname(), state='Running',
                                    info=thr_title)

    def expand_timespec(self, tspec):
        """ accepts \d+[dwh] and returns seconds """
        if tspec is None:
            return None
        m = re.match(r"^(\d+)([dwh])$", tspec)
        if m is None:
            self.L("invalid timespec: " + tspec)
            return None
        if m.group(2) == "d":
            return int(m.group(1)) * 24 * 60 * 60
        if m.group(2) == "w":
            return int(m.group(1)) * 7 * 24 * 60 * 60
        if m.group(2) == "h":
            return int(m.group(1)) * 60 * 60
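    # A hedged sketch, not called anywhere in this module: the method name is
    # new, and it exists only to pin down the timespec-to-seconds mapping that
    # expand_timespec implements above.
    def _expand_timespec_examples(self):
        assert self.expand_timespec("24h") == 24 * 60 * 60     # 86400
        assert self.expand_timespec("7d") == 7 * 24 * 60 * 60  # 604800
        assert self.expand_timespec("1w") == 7 * 24 * 60 * 60  # a week, same as 7d
        assert self.expand_timespec("90m") is None             # minutes are not supported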
    def remove_index_and_dereference(self, index_th, index_rowkey, co_tbl, index_table, document_rowkey):
        try:
            index_th.delete(index_rowkey)
            co_row = co_tbl.row(document_rowkey)
            # the back-reference column on the document row embeds the *index*
            # rowkey; same construction as in Indexer.commit
            fmt = "%ds" % (len(index_table) + 4)
            prk = struct.pack(fmt, "cf:" + str(index_table) + "_") + index_rowkey
            if prk in co_row:
                co_tbl.delete(document_rowkey, columns=[prk])
        except Exception as e:
            self.L("Failed to delete reference and index: " + index_table + " " +
                   str(e) + traceback.format_exc(None))

    def run(self, server):
        """
        thread:
            forever:
                foreach sec:        # eg botnet, phishing, whitelist
                    foreach pri:    # eg ipv4, ipv6, url
                        submit purge job(pri/sec)
                        record pri in a pri_list
                    submit purge job(difference of the sets all_pris and pri_list / sec)
        """
        with self.pool.connection() as dbh:
            secondaries = set(self.secondary_index.names())
            primaries = set(self.primary_index.names())

            while True:
                pri_done = []
                for sec in secondaries:
                    for pri in primaries:
                        if not self.primary_index.is_group(pri):
                            self.submit_purge_job(dbh, pri, sec)
                            pri_done.append(pri)
                    # handle the group primaries too; pri_done is a subset of
                    # primaries
                    diff = primaries - set(pri_done)
                    if len(diff) > 0:
                        self.submit_purge_job(dbh, diff, sec)
                time.sleep(self.purge_every)
                self.L("Purger awake after " + str(self.purge_every) + " seconds")

    def submit_purge_job(self, dbh, pri, sec):
        """
        future: submit a MR job
        current: just iterate

        FIX atm this is iodef specific, ideally we will handle other document types
        """
        self.L("begin purge of %s/%s" % (pri, sec))

        tables = dbh.tables()
        table_name = "index_" + sec

        if table_name in tables:
            tbl = dbh.table(table_name)
            co_tbl = dbh.table("cif_objs")

            for i in range(0, self.num_servers):
                self.L("purging index_%s on server %d" % (sec, i))
                pri_enum = self.primary_index.enum(pri)
                if pri_enum is not None:
                    rowpre = struct.pack(">HB", i, pri_enum)
                    oldest_allowed = self.lookup_max_lifespan(pri, sec)
                    for key, data in tbl.scan(row_prefix=rowpre, include_timestamp=True):
                        document_rowkey = None
                        data_age = None
                        if 'b:iodef_rowkey' in data:  # iodef handler
                            data_age = data['b:iodef_rowkey'][1]
                            document_rowkey = data['b:iodef_rowkey'][0]
                        # elif 'b:stix_rowkey' in data: ... etc

                        # purge entries older than their allowed lifespan
                        # (note: hbase cell timestamps are typically epoch
                        # milliseconds while time.time() is seconds, so a
                        # conversion may be needed here)
                        if data_age is not None and time.time() - data_age > oldest_allowed:
                            # cif_objs.row(iodef_rowkey) will contain a column
                            # "cf:index_$sec_$thisrowkey"; we want to delete
                            # that reference along with the index row
                            self.remove_index_and_dereference(tbl, key, co_tbl, table_name, document_rowkey)

    def lookup_max_lifespan(self, pri, sec):
        if pri is not None and sec is not None:
            # index.$pri.$sec.purge_after
            rkey = "index.%s.%s.purge_after" % (pri, sec)
            rv = self.registry.get(rkey)
            if rv is not None:
                return self.expand_timespec(rv)

        rv = self.registry.get("index.purge_after")  # global fallback
        if rv is not None:
            return self.expand_timespec(rv)

        return self.expand_timespec("270d")  # hardcoded default

    def L(self, msg):
        caller = ".".join([str(__name__), sys._getframe(1).f_code.co_name])
        if self.debug:
            print caller + ": " + msg
        else:
            self.log.L(caller + ": " + msg)
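# Illustrative sketch (nothing in this module calls it): shows how the
# back-reference column qualifier shared by Purger.remove_index_and_dereference
# and Indexer.commit is built. The format "%ds" % (len(table) + 4) is exactly
# len("cf:" + table + "_"), so struct.pack passes the string through at a fixed
# width before the binary index rowkey is appended. The salt, enum, and address
# bytes below are made-up example values.
def _reference_qualifier_example():
    index_table = "index_botnet"
    index_rowkey = struct.pack(">HB", 42, 1) + "\x7f\x00\x00\x01"  # salt + type enum + packed 127.0.0.1
    fmt = "%ds" % (len(index_table) + 4)                           # 16 == len("cf:index_botnet_")
    return struct.pack(fmt, "cf:" + index_table + "_") + index_rowkey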
class Indexer(object):
    """
    Builds rows in the index_<type> tables for indexable values and
    back-references each one from the matching document row in cif_objs.
    """

    def __init__(self, connectionPool, index_type, num_servers=1, table_batch_size=1000, debug=0):
        self.debug = debug

        print "indexer connect"
        self.pool = connectionPool

        print "indexer load primary index map"
        self.primary_index = PrimaryIndex(connectionPool, debug)

        print "index init log"
        self.log = Log(connectionPool)

        self.num_servers = num_servers

        # dynamically load one pack/unpack module per primary index name
        self.packers = {}
        for packer in self.primary_index.names():
            try:
                package = 'DB.PrimaryIndex.PackUnpack'
                self.L("loading packer " + package + "." + packer)
                __import__(package + "." + packer)
                pkg = sys.modules[package + "." + packer]
                self.packers[packer] = getattr(pkg, packer)
            except ImportError as e:
                self.L("warning: failed to load " + packer)

        with self.pool.connection() as dbh:
            t = dbh.tables()

            self.table_name = "index_" + index_type

            if not self.table_name in t:
                self.L("index table %s doesn't exist, creating it" % self.table_name)
                dbh.create_table(self.table_name, {'b': {'COMPRESSION': 'SNAPPY'}})

            self.table = dbh.table(self.table_name).batch(batch_size=table_batch_size)
            self.co_table = dbh.table("cif_objs").batch(batch_size=table_batch_size)

        self.reset()

        self.md5 = hashlib.md5()
        self.salt = Salt(self.num_servers, self.debug)

    def L(self, msg):
        caller = ".".join([str(__name__), sys._getframe(1).f_code.co_name])
        if self.debug:
            print caller + ": " + msg
        else:
            self.log.L(caller + ": " + msg)

    def pack_rowkey_ipv4(self, salt, addr):
        return struct.pack(">HB", salt, self.TYPE_IPV4()) + self.packers['ipv4'].pack(addr)

    def pack_rowkey_ipv6(self, salt, addr):
        return struct.pack(">HB", salt, self.TYPE_IPV6()) + self.packers['ipv6'].pack(addr)

    def pack_rowkey_fqdn(self, salt, fqdn):
        return struct.pack(">HB", salt, self.TYPE_FQDN()) + self.packers['domain'].pack(fqdn)

    def pack_rowkey_url(self, salt, url):
        return struct.pack(">HB", salt, self.TYPE_URL()) + self.packers['url'].pack(url)

    def pack_rowkey_email(self, salt, email):
        return struct.pack(">HB", salt, self.TYPE_EMAIL()) + self.packers['email'].pack(email)

    def pack_rowkey_search(self, salt, search):
        return struct.pack(">HB", salt, self.TYPE_SEARCH()) + self.packers['search'].pack(search)

    def pack_rowkey_malware(self, salt, malware_hash):
        return struct.pack(">HB", salt, self.TYPE_MALWARE()) + self.packers['malware'].pack(malware_hash)

    def pack_rowkey_asn(self, salt, asn):
        return struct.pack(">HB", salt, self.TYPE_ASN()) + self.packers['asn'].pack(asn)

    def reset(self):
        self.empty = True
        self.addr = None
        self.rowkey = None
        self.confidence = None
        self.addr_type = None
        self.iodef_rowkey = None

    def commit(self):
        """
        Commit the record to the index_* table.

        Update cif_objs(rowkey=self.iodef_rowkey) so that
        'cf:{self.table_name}_{self.rowkey}' = 1. Purger will remove the
        reference when this feed record is purged.

        With hbase, you can put an addt'l cell value into a table/row without
        having to merge. Existing cells won't be affected.
        """
        try:
            rowdict = {
                'b:confidence': str(self.confidence),
                'b:addr_type': str(self.addr_type),
                'b:iodef_rowkey': str(self.iodef_rowkey)
            }
            self.table.put(self.rowkey, rowdict)

            fmt = "%ds" % (len(self.table_name) + 4)
            prk = struct.pack(fmt, "cf:" + str(self.table_name) + "_") + self.rowkey
            self.co_table.put(self.iodef_rowkey, {prk: "1"})
        except Exception as e:
            self.L("failed to put record to %s table: %s" % (self.table_name, str(e)))
        self.reset()
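    # Rowkey layout produced by the pack_rowkey_* helpers above (a sketch;
    # the actual enum values come from the primary index map at runtime):
    #
    #   bytes 0-1   >H  salt, big-endian (spreads rows across region servers)
    #   byte  2     B   primary index enum, eg TYPE_IPV4()
    #   bytes 3..       type-specific packing from DB.PrimaryIndex.PackUnpack
    #
    # eg a salt of 42 and an ipv4 enum of 0 would serialize as
    # "\x00\x2a\x00" + self.packers['ipv4'].pack("127.0.0.1")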
    def extract(self, iodef_rowkey, iodef):
        """
        FIX atm this is iodef specific, ideally we will be able to index
        other document types
        """
        self.reset()

        self.iodef_rowkey = iodef_rowkey

        self.md5.update(iodef.SerializeToString())
        self.hash = self.md5.digest()

        ii = iodef.Incident[0]

        self.confidence = ii.Assessment[0].Confidence.content
        self.severity = ii.Assessment[0].Impact[0].severity

        # for malware hashes, they appear at the top level for now:
        #   iodef.Incident[].AdditionalData.meaning = "malware hash"
        #   iodef.Incident[].AdditionalData.content = "[the hash]"
        if hasattr(ii, 'AdditionalData'):
            for ed in ii.AdditionalData:
                if ed.meaning == "malware hash":
                    self.L("\tIndexing for malware hash")
                    self.rowkey = self.pack_rowkey_malware(self.salt.next(), ed.content)
                    self.commit()

        # addresses and networks live in the EventData[].Flow[].System[] tree
        if hasattr(ii, 'EventData') and len(ii.EventData) > 0:
            for ed in ii.EventData:
                for fl in ed.Flow:
                    for sy in fl.System:
                        for i in sy.Node.Address:
                            self.addr_type = i.category

                            # ipv4 addresses and networks
                            if self.addr_type in (RFC5070_IODEF_v1_pb2.AddressType.Address_category_ipv4_addr,
                                                  RFC5070_IODEF_v1_pb2.AddressType.Address_category_ipv4_net):
                                self.addr = i.content
                                self.rowkey = self.pack_rowkey_ipv4(self.salt.next(), self.addr)
                                self.L("Indexing for ipv4")
                                self.commit()

                            # ipv6 addresses and networks
                            elif self.addr_type in (RFC5070_IODEF_v1_pb2.AddressType.Address_category_ipv6_addr,
                                                    RFC5070_IODEF_v1_pb2.AddressType.Address_category_ipv6_net):
                                self.addr = i.content
                                self.rowkey = self.pack_rowkey_ipv6(self.salt.next(), self.addr)
                                self.L("Indexing for ipv6")
                                self.commit()

                            elif self.addr_type == RFC5070_IODEF_v1_pb2.AddressType.Address_category_asn:
                                self.addr = i.content
                                self.rowkey = self.pack_rowkey_asn(self.salt.next(), self.addr)
                                self.L("Indexing for ASN")
                                self.commit()

                            elif self.addr_type == RFC5070_IODEF_v1_pb2.AddressType.Address_category_ext_value:
                                if i.ext_category == "fqdn":
                                    self.fqdn = i.content
                                    self.rowkey = self.pack_rowkey_fqdn(self.salt.next(), self.fqdn)
                                    self.L("Indexing for FQDN")
                                    self.commit()
                                elif i.ext_category == "url":
                                    self.rowkey = self.pack_rowkey_url(self.salt.next(), i.content)
                                    self.L("Indexing for URL")
                                    self.commit()
                                else:
                                    # assumes PrimaryIndex.enum returns an int
                                    # enum (or None for unknown names), as in
                                    # Purger.submit_purge_job
                                    e = self.primary_index.enum(i.ext_category)
                                    if e is not None:
                                        self.rowkey = struct.pack(">HB", self.salt.next(), e) + self.packers[i.ext_category].pack(i.content)
                                        self.commit()
                                    else:
                                        self.L("Unknown primary index given " + i.ext_category)
                            else:
                                print "unhandled category: ", i

    def TYPE_IPV4(self):
        return self.primary_index.enum('ipv4')

    def TYPE_IPV6(self):
        return self.primary_index.enum('ipv6')

    def TYPE_FQDN(self):
        return self.primary_index.enum('domain')

    def TYPE_URL(self):
        return self.primary_index.enum('url')

    def TYPE_EMAIL(self):
        return self.primary_index.enum('email')

    def TYPE_SEARCH(self):
        return self.primary_index.enum('search')

    def TYPE_MALWARE(self):
        return self.primary_index.enum('malware')

    def TYPE_ASN(self):
        return self.primary_index.enum('asn')
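# Minimal usage sketch (illustrative only; assumes a happybase-style
# connection pool, which matches the pool.connection()/table()/batch() calls
# above, plus a thread tracker from the surrounding cif-db codebase -- the
# host name, pool size, and index type are placeholders):
#
#   import happybase
#   pool = happybase.ConnectionPool(size=8, host="localhost")
#   indexer = Indexer(pool, "botnet", num_servers=1)
#   purger = Purger(pool, num_servers=1, thread_tracker=tracker)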