class Log(object):
    def __init__(self, connectionPool, myhost=None, debug=0):
        self.debug = debug
        self.pool = connectionPool
        if myhost is not None:
            self.myhost = myhost
        else:
            self.myhost = socket.gethostname()
        self.registry = Registry(connectionPool, debug)
        self.num_servers = self.registry.get('hadoop.num_servers')
        if self.num_servers is None:
            self.num_servers = 1
        self.salt = Salt(self.num_servers, self.debug)

    def L(self, msg):
        # rowkey is a 2-byte salt followed by the current epoch time (4 bytes)
        rowdict = {'b:hostname': str(self.myhost), 'b:msg': str(msg)}
        try:
            rowkey = struct.pack(">HI", self.salt.next(), int(time.time()))
            with self.pool.connection() as connection:
                connection.table('log').put(rowkey, rowdict)
        except Exception as e:
            print "failed to put record to 'log' table: ", rowdict
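
# A minimal usage sketch (not part of the original class): it shows the 6-byte log
# rowkey layout that Log.L() packs above, i.e. a 2-byte big-endian salt followed by a
# 4-byte epoch timestamp. The salt value here is a made-up example; in the class it
# comes from Salt.next().
def _example_log_rowkey():
    import struct
    import time
    salt = 3  # hypothetical salt value
    rowkey = struct.pack(">HI", salt, int(time.time()))
    assert len(rowkey) == 6  # 2 bytes of salt + 4 bytes of timestamp
    return rowkey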
class SecondaryIndex(object):
    """
    The secondary index is the table name (index_botnet, index_malware) and
    corresponds to the second part of the query string.

        infrastructure/botnet
            pri = infrastructure (ipv4 and ipv6)
            sec = botnet
    """
    def __init__(self, connectionPool, debug=0):
        self.debug = debug
        self.pool = connectionPool
        self.registry = Registry(connectionPool, debug)
        self.names_list = []
        self.names_dict = {}
        self.load_secondary_index_map()

    def exists(self, name):
        return name in self.names_dict

    def names(self):
        return self.names_list

    def load_secondary_index_map(self):
        # 'index.secondary' is a comma separated list of secondary index names
        siv = self.registry.get('index.secondary')
        if siv is not None:
            self.names_list = []
            self.names_dict = {}
            for i in re.split(',', siv):
                n = i.strip()
                self.names_list.append(n)
                self.names_dict[n] = 1
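
# A minimal sketch (not part of the original class) of how load_secondary_index_map()
# interprets the 'index.secondary' registry value. The value shown is an assumed
# example; the real one lives in the Registry table.
def _example_secondary_index_names():
    import re
    siv = "botnet, malware, phishing"  # assumed registry value for 'index.secondary'
    names = [n.strip() for n in re.split(',', siv)]
    # each name maps to an HBase table called "index_<name>", e.g. index_botnet
    assert names == ['botnet', 'malware', 'phishing']
    return names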
class Purger(object):
    """
    Eventually, this will submit map reduce jobs. Since we have to do what amounts to
    full table scans, that's the best way to do it using hadoop. For this POC, it
    doesn't use MR.

    outline:
        load index.* registry values
            index.purge_every tells us how long to sleep for between MR submissions
            index.primary.secondary.purge_after tells us the max age of records we'll keep
            index.purge_after is the default if no pri.sec is specified

            eg:
                index.purge_every = 24h
                index.purge_after = 7d
                index.infrastructure.botnet.purge_after = 10d

        spawn a thread per server
            record them in threadtracker
    """
    def __init__(self, connectionPool, num_servers=1, thread_tracker=None, debug=0):
        self.debug = debug
        self.pool = connectionPool
        self.log = Log(connectionPool)
        self.L("cif-db Purger initializing")

        if thread_tracker is None:
            raise Exception("thread_tracker parameter can not be None")
        self.thread_tracker = thread_tracker

        self.registry = Registry(connectionPool, debug)
        self.primary_index = PrimaryIndex(connectionPool)
        self.secondary_index = SecondaryIndex(connectionPool)

        self.num_servers = self.registry.get('hadoop.num_servers')
        if self.num_servers is None:
            self.num_servers = 1

        self.purge_every = self.expand_timespec(self.registry.get('index.purge_every'))
        if self.purge_every is None:
            self.purge_every = 24 * 60 * 60
        self.L("Purger will run every " + str(self.purge_every) + " seconds")

        # map of primary -> secondary -> purge_after (seconds)
        self.prisecmap = {}
        for i in self.registry.get():
            m = re.match(r'^index\.([^\.]+)\.([^\.]+)\.purge_after', i)
            if m is not None:
                self.prisecmap.setdefault(m.group(1), {})[m.group(2)] = self.expand_timespec(self.registry.get(i))

        self.workers = []
        for server in range(0, self.num_servers):
            thr_title = "Purger daemon %d of %d" % (server, self.num_servers - 1)
            worker_thr = threading.Thread(target=self.run, name=thr_title, args=(server,))
            self.workers.append(worker_thr)
            worker_thr.daemon = True
            worker_thr.start()
            while not worker_thr.isAlive():
                self.log.L("waiting for purger/worker thread to become alive")
                time.sleep(1)
            self.L(thr_title)
            self.thread_tracker.add(id=worker_thr.ident, user='******', host=socket.gethostname(),
                                    state='Running', info=thr_title)

    def expand_timespec(self, tspec):
        """
        accepts: \d+[dwh] and returns seconds
        """
        if tspec is None:
            return None
        m = re.match(r"^(\d+)([dwh])$", tspec)
        if m is None:
            self.L("invalid timespec: " + tspec)
            return None
        if m.group(2) == "d":
            return int(m.group(1)) * 24 * 60 * 60
        if m.group(2) == "w":
            return int(m.group(1)) * 7 * 24 * 60 * 60
        if m.group(2) == "h":
            return int(m.group(1)) * 60 * 60

    def remove_index_and_dereference(self, index_th, index_rowkey, co_tbl, index_table, document_rowkey):
        try:
            index_th.delete(index_rowkey)
            co_row = co_tbl.row(document_rowkey)
            fmt = "%ds" % (len(index_table) + 4)  # Also in Indexer
            prk = struct.pack(fmt, "cf:" + str(index_table) + "_") + document_rowkey
            if prk in co_row:
                co_tbl.delete(document_rowkey, columns=[prk])
        except Exception as e:
            self.L("Failed to delete reference and index: " + index_table + str(e) + traceback.format_exc(None))

    def run(self, server):
        """
        thread:
            forever:
                foreach sec:       # eg botnet, phishing, whitelist
                    foreach pri:   # eg ipv4 ipv6 url
                        submit purge job(pri/sec)
                        record pri in a pri_list
                    submit purge job(difference of the sets all_pris and pri_list / sec)
        """
        with self.pool.connection() as dbh:
            secondaries = set(self.secondary_index.names())
            primaries = set(self.primary_index.names())

            while True:
                pri_done = []
                for sec in secondaries:
                    for pri in primaries:
                        if not self.primary_index.is_group(pri):
                            self.submit_purge_job(dbh, pri, sec)
                            pri_done.append(pri)  # remove groups too
                    # pri_done is a subset of primaries
                    diff = primaries - set(pri_done)
                    if len(diff) > 0:
                        self.submit_purge_job(dbh, diff, sec)
                time.sleep(self.purge_every)
                self.L("Purger awake after " + str(self.purge_every) + " seconds")

    def submit_purge_job(self, dbh, pri, sec):
        """
        future: submit a MR job
        current: just iterate

        FIX atm this is iodef specific, ideally we will handle other document types
        """
        self.L("begin purge of %s/%s" % (pri, sec))
        tables = dbh.tables()
        table_name = "index_" + sec
        if table_name in tables:
            tbl = dbh.table("index_" + sec)
            co_tbl = dbh.table("cif_objs")
            for i in range(0, self.num_servers):
                self.L("purging index_%s on server %d" % (sec, i))
                pri_enum = self.primary_index.enum(pri)
                if pri_enum is not None:
                    rowpre = struct.pack(">HB", i, pri_enum)
                    oldest_allowed = self.lookup_max_lifespan(pri, sec)
                    for key, data in tbl.scan(row_prefix=rowpre, include_timestamp=True):
                        document_rowkey = None
                        data_age = None
                        if 'b:iodef_rowkey' in data:
                            # iodef handler
                            data_age = data['b:iodef_rowkey'][1]
                            document_rowkey = data['b:iodef_rowkey'][0]
                        # elif 'b:stix_rowkey' in data: ... etc
                        if data_age is not None and time.time() - data_age > oldest_allowed:
                            # purge only entries older than the allowed lifespan;
                            # cif_objs.row(iodef_rowkey) will contain a column
                            # "cf:index_$sec_$thisrowkey" and we want to delete that reference
                            self.remove_index_and_dereference(tbl, key, co_tbl, table_name, document_rowkey)

    def lookup_max_lifespan(self, pri, sec):
        if pri is not None and sec is not None:
            # index.$pri.$sec.purge_after
            rkey = "index.%s.%s.purge_after" % (pri, sec)
            rv = self.registry.get(rkey)
            if rv is not None:
                return self.expand_timespec(rv)
        rv = self.registry.get("index.purge_after")  # global fallback
        if rv is not None:
            return self.expand_timespec(rv)
        return self.expand_timespec("270d")  # hardcoded default

    def L(self, msg):
        caller = ".".join([str(__name__), sys._getframe(1).f_code.co_name])
        if self.debug is not None:
            print caller + ": " + msg
        else:
            self.log.L(caller + ": " + msg)
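
# A self-contained sketch (not part of the original class) mirroring
# Purger.expand_timespec() above: "\d+[dwh]" timespecs become seconds. The sample
# values are illustrative only.
def _example_expand_timespec():
    import re

    def expand_timespec(tspec):
        m = re.match(r"^(\d+)([dwh])$", tspec or "")
        if m is None:
            return None
        return int(m.group(1)) * {"h": 3600, "d": 86400, "w": 604800}[m.group(2)]

    assert expand_timespec("24h") == 86400    # index.purge_every example
    assert expand_timespec("7d") == 604800    # index.purge_after example
    assert expand_timespec("10d") == 864000   # index.infrastructure.botnet.purge_after example
    assert expand_timespec("bogus") is None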
class PrimaryIndex(object):
    """
    The primary index is the first part of the query string. Eg. "infrastructure" or
    "url". This corresponds to the third byte of the hbase rowkey.

    We allow for groups in the primary index. For example,

        ipv4 = 0
        ipv6 = 1
        infrastructure = ipv4,ipv6
    """
    def __init__(self, connectionPool, debug=0):
        self.debug = debug
        self.pool = connectionPool
        self.registry = Registry(connectionPool, debug)
        self.index_to_enum = {}  # name -> enum
        self.enum_to_index = {}  # enum -> name
        self.load_primary_index_map()

    def names(self):
        """
        Return all of the primary index names, including group names.
        """
        return self.index_to_enum.keys()

    def is_group(self, name):
        """
        If the given name is a group, return True, else False.
        """
        if name in self.index_to_enum:
            v = self.index_to_enum[name]
            if type(v) is not int:
                return True
        return False

    def reduce_group(self, name):
        """
        If the given name is a group, return [group member names], else return [name].
        """
        if name in self.index_to_enum:
            v = self.index_to_enum[name]
            if type(v) is int:
                return [name]
            rv = []
            for innername in re.split(',', v):
                rv.append(innername.strip())
            return rv
        return [name]

    def enum(self, name):
        """
        Return the enum value(s) for the given primary index name. In the case where
        the given index name is a group, a list of enum values is returned; otherwise
        a single integer is returned.
        """
        enums = []
        if name in self.index_to_enum:
            v = self.index_to_enum[name]
            if type(v) is int:
                return v
            for innername in re.split(',', v):
                enums.append(self.enum(innername.strip()))
        return enums

    def name(self, enum):
        """
        Given an index enumeration value, return the name of the index.
        """
        if enum in self.enum_to_index:
            return self.enum_to_index[enum]
        return None

    def load_primary_index_map(self):
        # registry keys look like "index.primary.<name>"; integer values are enums,
        # string values are comma separated groups of other primary index names
        for reg_key in self.registry.get():
            reg_val = self.registry.get(reg_key)
            if re.match(r'^index\.primary\.', reg_key):
                x = re.split(r'\.', reg_key)
                self.index_to_enum[x[2]] = reg_val
                if type(reg_val) is int:
                    self.enum_to_index[reg_val] = x[2]
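
# A minimal sketch (not part of the original class) of the registry layout that
# load_primary_index_map() expects: "index.primary.<name>" keys whose integer values
# are enums and whose string values are comma separated groups. The key/value pairs
# below are assumptions for illustration only.
def _example_primary_index_map():
    import re
    registry = {
        "index.primary.ipv4": 0,
        "index.primary.ipv6": 1,
        "index.primary.infrastructure": "ipv4,ipv6",  # a group
    }
    index_to_enum = {}
    enum_to_index = {}
    for reg_key, reg_val in registry.items():
        if re.match(r'^index\.primary\.', reg_key):
            name = re.split(r'\.', reg_key)[2]
            index_to_enum[name] = reg_val
            if type(reg_val) is int:
                enum_to_index[reg_val] = name
    # 'infrastructure' maps to a string, so is_group('infrastructure') would be True
    # and enum('infrastructure') would return [0, 1]
    assert enum_to_index == {0: 'ipv4', 1: 'ipv6'}
    return index_to_enum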
class Query(object):
    def __init__(self, connectionPool, p_index, s_index, debug):
        self.debug = debug
        self.primary_index = p_index
        self.secondary_index = s_index
        self.pool = connectionPool

        try:
            self.registry = Registry(connectionPool, debug)

            self.num_servers = self.registry.get('hadoop.num_servers')
            if self.num_servers is None:
                self.num_servers = 1

            # load one pack/unpack module per primary index name (ipv4, url, email, ...)
            self.packers = {}
            for packer in self.primary_index.names():
                try:
                    package = 'DB.PrimaryIndex.PackUnpack'
                    self.L("loading packer " + package + "." + packer)
                    __import__(package + "." + packer)
                    pkg = sys.modules[package + "." + packer]
                    self.packers[packer] = getattr(pkg, packer)
                except ImportError as e:
                    self.L("warning: failed to load " + packer)

            with self.pool.connection() as dbh:
                self.tbl_co = dbh.table('cif_objs')
                self.available_tables = dbh.tables()
        except Exception as e:
            self.L("failed to open tables")
            print e
            raise

    def L(self, msg):
        caller = ".".join([str(__name__), sys._getframe(1).f_code.co_name])
        if self.debug is not None and self.debug > 0:
            print caller + ": " + msg
        else:
            syslog.syslog(caller + ": " + msg)

    def decode_query(self, qstring):
        """
        Given a query string, return a dictionary containing:

            {
                'primary'   : [INTEGER COUPLE],
                'prinames'  : [STRING COUPLE],
                'secondary' : STRING,
                'limiter'   : { 'type' : INTEGER, 'value' : STR }
            }

        eg (infra = ipv4, ipv6 = 0, 1):

            infrastructure/botnet
                { 'primary' : [0,1], 'secondary' : 'botnet', 'limiter' : None }

            infrastructure/botnet,10.10.0.0/16
                { 'primary' : [0,1], 'secondary' : 'botnet',
                  'limiter' : { 'type' : 0, 'value' : '10.10.0.0/16' } }

        Where 'type', above, is a guess based on the types of things we expect to be
        queried for: IP addresses, domain names, email addresses, URLs.

        What can we do with this? We can open the correct secondary index table. We can
        pack the rowkey based on the primary index. If the primary index is a couple, we
        set a start and stop rowkey. If it's only a single value, we use it as a row
        prefix. If we have a limiter, we pack it based on its type.
        """
        rv = {}

        if re.match(r'^[a-z0-9]+/[a-z0-9]+$', qstring, flags=re.IGNORECASE):
            # "primary/secondary" only
            indexparts = re.split('/', qstring)
            if len(indexparts) != 2:
                raise Exception("Query prefix not in the form of index1/index2")

            pi_enum = self.primary_index.enum(indexparts[0])
            if type(pi_enum) is int:
                pi_enum = [pi_enum]  # primary was not a group, so we only got a single enum back

            if len(pi_enum) > 0 and self.secondary_index.exists(indexparts[1]):
                rv['primary'] = pi_enum
                rv['prinames'] = self.primary_index.reduce_group(indexparts[0])
                rv['secondary'] = indexparts[1]
                rv['limiter'] = {'type': None, 'value': None}

        elif re.match(r'^[a-z0-9]+/[a-z0-9]+,', qstring, flags=re.IGNORECASE):
            # "primary/secondary,limiter" both specified
            qparts = re.split(',', qstring)
            if len(qparts) > 2:
                qparts[1] = ''.join(qparts[1:])
                del qparts[2:]

            indexparts = re.split('/', qparts[0])
            if len(indexparts) != 2:
                raise Exception("Query prefix not in the form of index1/index2")

            pi_enum = self.primary_index.enum(indexparts[0])
            if type(pi_enum) is int:
                pi_enum = [pi_enum]  # primary was not a group, so we only got a single enum back

            limit_enum = self.guesstypeof(qparts[1])

            # make sure they didn't give us, eg, an email limiter for an ipv4 primary index
            if limit_enum not in pi_enum:
                raise Exception("Limiter mismatched with primary index")

            pi_enum = [limit_enum]

            if len(pi_enum) > 0 and self.secondary_index.exists(indexparts[1]):
                rv['primary'] = pi_enum
                rv['prinames'] = self.primary_index.name(limit_enum)
                rv['secondary'] = indexparts[1]
                rv['limiter'] = {'type': self.guesstypeof(qparts[1]), 'value': qparts[1]}

        else:
            # "limiter" only specified
            rv['primary'] = [self.guesstypeof(qstring)]
            rv['prinames'] = self.primary_index.name(self.guesstypeof(qstring))
            rv['secondary'] = None
            rv['limiter'] = {'type': self.guesstypeof(qstring), 'value': qstring}

        return rv

    def guesstypeof(self, s):
        """
        Try to figure out which primary index applies to the given string:
        ipv4, ipv6, url, email, domain.

        This information is useful when we get a limiter with no pri/sec hints. So if
        the query is "10.10.0.0" we want to know that it's an ipv4 address so we can
        construct the start and stop rowkey appropriately.
        """
        try:
            ipv = IP(s).version()
            if ipv == 4:
                return self.primary_index.enum('ipv4')
            if ipv == 6:
                return self.primary_index.enum('ipv6')
        except ValueError as e:
            try:
                o = urlparse(s)

                # a hash, eg 10299abe93984f8e8d8e9f
                if o.scheme == '' and re.match(r'^[0-9a-f]+$', o.path, flags=re.IGNORECASE) is not None:
                    return self.primary_index.enum('malware')

                # an email, [email protected]
                if o.scheme == '' and re.search(r'@', o.path) is not None:
                    return self.primary_index.enum('email')

                # a domainname
                if o.scheme == '' and re.match(r'^[a-zA-Z0-9][a-zA-Z0-9-]{1,61}[a-zA-Z0-9]\.[a-zA-Z]{2,}$', o.path) is not None:
                    return self.primary_index.enum('domain')

                # a url
                if o.scheme != '':
                    return self.primary_index.enum('url')

                # an asn
                if o.scheme == '' and re.match(r'^\d+$', o.path) is not None:
                    return self.primary_index.enum('asn')
            except ValueError as e:
                return self.primary_index.enum('search')
        return self.primary_index.enum('search')

    def setqr(self, qr):
        self.qr = qr

    def ipv4_to_start_end_ints(self, v4):
        """
        Given (possibly) a cidr block, return the start addr and end addr (plus one) as
        ints. If no mask given, end = start.
        """
        p = v4.split('/')
        octs = p[0].split('.')
        if len(octs) != 4:
            self.L("ipv4_to_start_end_ints: Invalid parameter: " + str(v4))
            return 0
        for i in range(0, 4):
            octs[i] = int(octs[i])
        start = octs[0] << 24 | octs[1] << 16 | octs[2] << 8 | octs[3]
        if len(p) == 2:
            maskbits = int(p[1])
            if maskbits < 0 or maskbits > 32:
                self.L("ipv4_to_start_end_ints: Invalid bitmask: " + str(maskbits))
                return 0
            mask = 2 ** (32 - maskbits) - 1  # host bits
            end = start | mask
            end = end + 1
        else:
            end = start
        return [start, end]

    def setlimit(self, limit):
        self.limit = limit

    """
    we will fetch up to self.limit records matching the query, pack them into iodef
    documents, insert them into the QueryResponse and return that. that object (the QR)
    will be placed back into the control message and sent back to the client from which
    it came.

        infra/botnet    infra/malware   infra/scan
        domain/botnet   domain/malware
        url/botnet      url/malware     url/phishing

        <2 byte salt>

        ipv4    = 0x0  (infrastructure/botnet)
        ipv6    = 0x1  (infrastructure/botnet)
        fqdn    = 0x2  (domain/botnet)
        url     = 0x3  (url/botnet)
        email   = 0x4  (email/botnet)
        search  = 0x5  (search/botnet)
        malware = 0x6  (malware/botnet)
        asn     = 0x7  (asn/botnet)

    so to query for all infra_botnet, thread out for each salt (one thread per salt val)
    and append 0x0 or 0x1. if they give a specific netblock or ip, append that as well.

    for domain_botnet, one thread per salt and append 0x2; for a specific domain, append
    the length (2 bytes) and then the domain.
    """

    def execqr(self):
        self.L("execute query: " + self.qr.query)

        try:
            decoded_query = self.decode_query(self.qr.query)

            # infrastructure/botnet,[email protected]
            #   {'limiter': {'type': 4, 'value': u'*****@*****.**'}, 'primary': [0, 1], 'secondary': u'malware'}
            #   result: invalid/mismatched limiter/primary
            # infrastructure/botnet
            #   {'limiter': {'type': None, 'value': None}, 'secondary': u'botnet', 'primary': [0, 1], 'prinames': ['ipv4', 'ipv6']}
            #   result: valid, query index_botnet for all ipv4/ipv6 rows
            # 10.10.0.0/16
            #   {'limiter': {'type': 0, 'value': u'10.10.0.0/16'}, 'secondary': None, 'primary': None, 'prinames': None}
            #   result: valid, query all secondaries for primary type '0' and pack 10.10.0.0 onto
            #           the start and 10.10.255.255 onto the end rowkey
            #
            # open table index_$secondary
            #
            # if len(primary) is 2:
            #     pack start rowkey using primary[0]
            #     pack stop rowkey using primary[1]
            # else:
            #     pack rowprefix using primary[0]
            #
            # if we have a limiter, pack it into the end of the rowkey
            #     len(primary) must be 1 if we have a limiter
            #
            # if stop rowkey != none then use scan(start=,stop=)
            # else use scan(rowprefix=)

            secondaries_to_scan = []

            if 'secondary' in decoded_query and decoded_query['secondary'] is not None:
                secondaries_to_scan.append(decoded_query['secondary'])
            else:
                secondaries = self.registry.get('index.secondary')
                secondaries_to_scan = re.sub(r'\s*', r'', secondaries).split(',')

            qrs = control_pb2.QueryResponse()

            # TODO: spawn a thread for each secondary to scan, coalesce results
            # TODO: spawn a thread for each salt to scan, coalesce results

            qrs.ReportTime = datetime.datetime.now().isoformat(' ')
            qrs.description = self.qr.query

            with self.pool.connection() as dbh:
                for server in range(0, self.num_servers):
                    for secondary in secondaries_to_scan:
                        table_name = "index_" + secondary
                        if table_name not in self.available_tables:
                            continue
                        table = dbh.table(table_name)

                        if decoded_query['primary'] is not None:
                            if len(decoded_query['primary']) == 1:
                                rowprefix = struct.pack('>HB', server, decoded_query['primary'][0])

                                # limiter/type and limiter/value are always present but may be None
                                if decoded_query['limiter']['type'] is not None:
                                    packer = self.primary_index.name(decoded_query['limiter']['type'])  # use 'prinames' instead of this lookup
                                    rowprefix = rowprefix + self.packers[packer].pack(decoded_query['limiter']['value'])

                                for key, value in table.scan(row_prefix=rowprefix):
                                    iodef_rowkey = value['b:iodef_rowkey']
                                    iodef_row = self.tbl_co.row(iodef_rowkey)
                                    for key in iodef_row:
                                        value = iodef_row[key]
                                        if re.match(r'cf:index_', key) is None:
                                            bot = (key.split(":"))[1]
                                            qrs.baseObjectType.append(bot)
                                            qrs.data.append(value)
                                            break

                            elif len(decoded_query['primary']) == 2:
                                startrow = struct.pack('>HB', server, decoded_query['primary'][0])
                                stoprow = struct.pack('>HB', server, decoded_query['primary'][1])

                                if decoded_query['limiter']['type'] is not None:
                                    print "limiter given of type " + self.primary_index.name(decoded_query['limiter']['type'])
                                    print "we shouldnt get here"

                                for key, value in table.scan(row_start=startrow, row_stop=stoprow):
                                    iodef_rowkey = value['b:iodef_rowkey']
                                    iodef_row = self.tbl_co.row(iodef_rowkey)
                                    for key in iodef_row:
                                        value = iodef_row[key]
                                        if re.match(r'cf:index_', key) is None:
                                            bot = (key.split(":"))[1]
                                            qrs.baseObjectType.append(bot)
                                            qrs.data.append(value)
                                            break

                        elif decoded_query['primary'] is None:
                            print "no primary given case"
                            print "we shouldnt get here"

            return qrs

        except Exception as e:
            print e
            traceback.print_exc(file=sys.stdout)
            raise e
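
# A minimal sketch (not part of the original class) of how execqr() builds a row
# prefix for a single-primary query: a 2-byte big-endian salt/server number followed by
# a 1-byte primary index enum. The enum value and server number below are assumed for
# illustration; a limiter (e.g. a packed ipv4 value) would be appended to this prefix
# by the matching packer.
def _example_index_row_prefix():
    import struct
    server = 0    # salt / server number
    pri_enum = 0  # hypothetical enum for 'ipv4'
    rowprefix = struct.pack(">HB", server, pri_enum)
    assert len(rowprefix) == 3
    return rowprefix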
class Exploder(object):
    def __init__(self, connectionPool, thread_tracker, debug):
        self.debug = debug
        self.pool = connectionPool

        if thread_tracker is None:
            raise Exception("thread_tracker parameter can not be None")
        self.thread_tracker = thread_tracker

        self.registry = Registry(connectionPool, debug)

        self.num_servers = self.registry.get('hadoop.num_servers')
        if self.num_servers is None:
            self.num_servers = 1

        self.batch_size = self.registry.get('hbase.batch_size')
        if self.batch_size is None:
            self.batch_size = 1000

        """
        We create one exploder thread per hbase server. Each thread has its own hbase
        connection.

            foreach server (1 .. numservers)
                spawn_exploder_thread(server)
        """
        self.workers = []
        for server in range(0, self.num_servers):
            thr_title = "Exploder daemon %d of %d" % (server, self.num_servers - 1)
            worker_thr = threading.Thread(target=self.run, name=thr_title, args=(server,))
            self.workers.append(worker_thr)
            worker_thr.daemon = True
            worker_thr.start()
            while not worker_thr.isAlive():
                print "waiting for exploder/worker thread to become alive"
                time.sleep(1)
            self.thread_tracker.add(id=worker_thr.ident, user='******', host=socket.gethostname(),
                                    state='Running', info=thr_title)

    def L(self, msg):
        caller = ".".join([str(__name__), sys._getframe(1).f_code.co_name])
        if self.debug is not None:
            print caller + ": " + msg
        else:
            syslog.syslog(caller + ": " + msg)

    def do_some_work(self):
        self.kickit.release()

    def getcheckpoint(self, salt):
        t = self.registry.get('exploder.checkpoint.' + str(salt))
        if t is not None:
            return t
        return 0

    def setcheckpoint(self, salt, ts):
        self.registry.set('exploder.checkpoint.' + str(salt), ts)

    def run(self, salt):
        """
        run(salt)

        this routine scans the cif_obj db for rows starting at "salt" + last checkpoint
        timestamp and ending at row "salt" + now(). each row read in is passed to the
        Indexer for indexing.
        """
        self.L("Exploder thread running for salt: " + str(salt))

        with self.pool.connection() as dbh:
            index_handler = {}  # Indexer.Indexer(self.dbh, "botnet", self.num_servers, self.debug)

            while True:
                co = dbh.table('cif_objs')

                startts = self.getcheckpoint(salt)
                endts = int(time.time())
                processed = 0

                #self.L("processing: " + str(startts) + " to " + str(endts))

                if startts == 0:
                    startts = 1

                srowid = struct.pack(">HIIIII", salt, startts - 1, 0, 0, 0, 0)
                erowid = struct.pack(">HIIIII", salt, endts, 0, 0, 0, 0)

                for key, data in co.scan(row_start=srowid, row_stop=erowid):
                    contains = data.keys()[0]
                    obj_data = data[contains]

                    if contains == "cf:RFC5070_IODEF_v1_pb2":
                        iodef = RFC5070_IODEF_v1_pb2.IODEF_DocumentType()
                        try:
                            iodef.ParseFromString(obj_data)
                            #print "IODEF: ", iodef

                            ii = iodef.Incident[0]
                            table_type = ii.Assessment[0].Impact[0].content.content
                            self.L("\tIndexing: " + table_type)

                            # check to make sure table_name is in index.secondary
                            # index.secondary contains a list of configured/permitted secondary index types
                            if table_type not in index_handler:
                                self.L("index handler for table type %s doesnt exist, creating a new handler thread" % (table_type))
                                index_handler[table_type] = Indexer.Indexer(self.pool, table_type, self.num_servers, self.batch_size, self.debug)

                            index_handler[table_type].extract(key, iodef)
                            processed = processed + 1
                        except Exception as e:
                            print "Failed to parse restored object: ", e
                            traceback.print_exc()
                    else:
                        print "Contains an unsupported object type: ", contains

                time.sleep(5)
                if processed > 0:
                    self.setcheckpoint(salt, endts)
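
# A minimal sketch (not part of the original class) of the cif_objs scan boundaries
# built in run() above: 22-byte rowkeys made of a 2-byte salt, a 4-byte timestamp and
# four 4-byte fields. The salt and checkpoint values are assumed for illustration.
def _example_cif_objs_scan_range():
    import struct
    import time
    salt = 0
    startts = 1  # hypothetical last checkpoint
    endts = int(time.time())
    srowid = struct.pack(">HIIIII", salt, startts - 1, 0, 0, 0, 0)
    erowid = struct.pack(">HIIIII", salt, endts, 0, 0, 0, 0)
    assert len(srowid) == 22 and len(erowid) == 22
    return srowid, erowid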
global cf
global exploder
global primary_index
global secondary_index
global thread_tracker

try:
    print "Connect to HBase"
    connectionPool = HBConnection(hbhost)

    with connectionPool.connection() as connection:
        cif_objs = connection.table('cif_objs').batch(batch_size=5)  # set very low for development, set to 1000+ for test/qa/prod
        cif_idl = connection.table('cif_idl')

        print "Init Registry"
        registry = Registry(connectionPool, debug)

        num_servers = registry.get('hadoop.num_servers')
        if num_servers is None:
            num_servers = 1
            print "hadoop.num_servers not set. defaulting."
        print "hadoop.num_servers = ", num_servers

        salt = Salt(num_servers, debug)
        thread_tracker = ThreadTracker(debug)

        global apikeys

        log = Log(connectionPool)
        log.L("cif-db initializing")

        print "Initializing APIKeys object"
class Exploder(object):
    def __init__(self, hbhost, debug):
        self.debug = debug
        self.dbh = happybase.Connection(hbhost)
        t = self.dbh.tables()
        self.table = self.dbh.table('infrastructure_botnet')
        self.kickit = threading.Semaphore(0)
        self.proc_thread = threading.Thread(target=self.run, args=())
        self.proc_thread.start()
        self.botnet_handler = Botnet.Botnet(self.dbh, debug)
        self.registry = Registry(hbhost, debug)
        self.num_servers = self.registry.get('hadoop.num_servers')
        if self.num_servers is None:
            self.num_servers = 1
        self.salt = Salt(self.num_servers, self.debug)

    def L(self, msg):
        caller = ".".join([str(__name__), sys._getframe(1).f_code.co_name])
        if self.debug is not None:
            print caller + ": " + msg
        else:
            syslog.syslog(caller + ": " + msg)

    def do_some_work(self):
        self.kickit.release()

    def getcheckpoint(self):
        t = self.registry.get('exploder.checkpoint')
        if t is not None:
            return t
        return 0

    def setcheckpoint(self, ts):
        self.registry.set('exploder.checkpoint', ts)

    def run(self):
        self.L("Exploder running")

        while True:
            self.L("waiting for work")
            self.kickit.acquire()  # will block provided kickit is 0
            self.L("wakup")

            co = self.dbh.table('cif_objs')
            self.L("connected to cif_objs")

            startts = self.getcheckpoint()
            endts = int(time.time())
            processed = 0
            self.L("processing: " + str(startts) + " to " + str(endts))

            salt = 0xFF00  # FIX fix in poc-db at the same time (in writeToDb())

            srowid = struct.pack(">HIIIII", salt, startts, 0, 0, 0, 0)
            erowid = struct.pack(">HIIIII", salt, endts, 0, 0, 0, 0)

            for key, data in co.scan(row_start=srowid, row_stop=erowid):
                contains = data.keys()[0]
                obj_data = data[contains]

                if contains == "cf:RFC5070_IODEF_v1_pb2":
                    iodef = RFC5070_IODEF_v1_pb2.IODEF_DocumentType()
                    try:
                        iodef.ParseFromString(obj_data)
                        #print iodef

                        ii = iodef.Incident[0]
                        table_type = ii.Assessment[0].Impact[0].content.content
                        rowkey = None

                        if table_type == "botnet":
                            self.L("botnet")
                            self.botnet_handler.extract(iodef)
                            self.botnet_handler.commit()
                        elif table_type == "malware":
                            self.L("malware")
                    except Exception as e:
                        print "Failed to parse restored object: ", e
                        traceback.print_exc()

            self.setcheckpoint(endts + 1)
if o == "-t": key_type = a elif o == "-v": key_value = a elif o == "-k": key_name = a elif o == "-d": del_name = a elif o == "-h": usage() sys.exit(2) elif o == "-D": debug = a reg = Registry("localhost", debug) if del_name != None: reg.delete(del_name) kv = reg.get(del_name) if kv != None: print "Failed to delete the key: it seems to still be in the database." elif key_name != None: if key_type != None and key_value != None: key_value = cast(key_type, key_value) reg.set(key_name, key_value) kv = reg.get(key_name) if kv == key_value: print key_name + " has been set to " + str(key_value) else:
except getopt.GetoptError, err:
    print str(err)
    usage()
    sys.exit(2)

hbhost = "localhost"

for o, a in opts:
    if o == "-h":
        usage()
        sys.exit(2)
    elif o == "-H":
        hbhost = a

c = HBConnection(hbhost)
registry = Registry(hbhost, False)

num_servers = registry.get("hadoop.num_servers")
if num_servers is None:
    num_servers = 1

primary_index_map = load_primary_index_map(registry)

for table in c.tables():
    if re.match("^index_", table):
        print "\nChecking: ", table

        index_entries_per_server = {}
        index_entries_per_primary_index = {}

        th = c.table(table)