import gzip
import hashlib
import logging
import random
import re
import time
from datetime import datetime

from elasticsearch.helpers import bulk
# Assumed to be George Starcher's Splunk HEC helper class, which provides
# batchEvent()/flushBatch(): https://github.com/georgestarcher/Splunk-Class-httpevent
from splunk_http_event_collector import http_event_collector


def process_hosts(q, es):
    """
    :param q: The Queue object that hosts should be pulled off of
    :param es: An Elasticsearch connection. This way each worker has its own
               connection and you don't have to share it across multiple
               workers/processes
    :return:
    """
    bulk_hosts = []
    while True:
        line = q.get()
        if line == "DONE":
            # Flush whatever is left in the buffer before exiting.
            bulk(es, bulk_hosts)
            return True
        host = proccess_host(line)  # external helper that parses a raw line
        # Deterministic document ID: the same (host, hash, source) triple
        # always maps to the same _id, so re-imports update rather than
        # duplicate. encode() keeps the sha1 call working on Python 3.
        cert_hash = hashlib.sha1(
            (host['host'] + host['hash'] + host['source']).encode('utf-8'))
        cert_hash = cert_hash.hexdigest()
        action = {
            "_op_type": "update",
            "_index": 'passive-ssl-hosts-sonar',
            "_type": "host",
            "_id": cert_hash,
            "doc": host,
            "doc_as_upsert": True,
        }
        bulk_hosts.append(action)
        # Ship updates to Elasticsearch in batches of 500.
        if len(bulk_hosts) == 500:
            bulk(es, bulk_hosts)
            bulk_hosts = []
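
# A minimal driver sketch (not part of the original code) showing one way to
# feed process_hosts(): each worker process builds its own Elasticsearch
# connection, as the docstring above suggests, and receives one "DONE"
# sentinel so its loop exits. The input path and worker count are hypothetical.
def _es_worker(q):
    from elasticsearch import Elasticsearch
    process_hosts(q, Elasticsearch())  # one connection per worker process


def run_es_workers_example(input_path='hosts.csv', num_workers=4):
    from multiprocessing import Process, Queue

    q = Queue(maxsize=10000)
    workers = [Process(target=_es_worker, args=(q,)) for _ in range(num_workers)]
    for w in workers:
        w.start()
    with open(input_path) as infile:
        for line in infile:
            q.put(line.strip())
    for _ in workers:
        q.put("DONE")  # one sentinel per worker
    for w in workers:
        w.join()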

def process_hosts(q, es, initial):
    """
    :param q: The Queue object that hosts should be pulled off of
    :param es: An Elasticsearch connection. This way each worker has its own
               connection and you don't have to share it across multiple
               workers/processes
    :param initial: If this is the initial upload then we set
                    first_seen = last_seen. Otherwise first_seen is left
                    blank and will be cleaned up later
    :return:
    """
    bulk_hosts = []
    while True:
        line = q.get()
        if line == "DONE":
            bulk(es, bulk_hosts)
            return True
        host = proccess_host(line)
        cert_hash = hashlib.sha1(
            (host['host'] + host['hash'] + host['source']).encode('utf-8'))
        cert_hash = cert_hash.hexdigest()
        if initial:
            host['first_seen'] = host['last_seen']
        action = {
            "_op_type": "update",
            "_index": 'passive-ssl-hosts-umich',
            "_type": "host",
            "_id": cert_hash,
            # Index the parsed host document, not the raw input line.
            "doc": host,
            "doc_as_upsert": True,
        }
        bulk_hosts.append(action)
        if len(bulk_hosts) == 500:
            bulk(es, bulk_hosts)
            bulk_hosts = []
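
# A small sketch (the field values are made up) of why the _id above is a
# sha1 over host + hash + source: identical observations always hash to the
# same document ID, so re-running an import upserts existing documents via
# doc_as_upsert instead of inserting duplicates.
def stable_id_example():
    seen = {'host': '192.0.2.10', 'hash': 'deadbeef', 'source': 'umich'}
    ids = [hashlib.sha1((seen['host'] + seen['hash'] + seen['source'])
                        .encode('utf-8')).hexdigest() for _ in range(2)]
    assert ids[0] == ids[1]  # same triple -> same _id -> update, not a new doc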

def process_hosts_file(gzfilename, key, logger, host='localhost',
                       batchsize=16384, index='hosts',
                       sourcetype='sonar-host', useesid=False):
    logger.warning("Loading file {f} at {d}".format(f=gzfilename, d=datetime.now()))
    hec = http_event_collector(key, host)
    # 'rt' yields text lines on Python 3 (the original opened in 'rb').
    with gzip.open(gzfilename, 'rt') as resultsfile:
        # Pull the YYYYMMDD scan date out of the file path.
        m = re.search(r'.*/(\d{8})', gzfilename)
        filedate = m.group(1)
        filedate_struct = time.strptime(filedate, "%Y%m%d")
        filedate_epoch = time.mktime(filedate_struct)
        batchcount = 0
        for line in resultsfile:
            cleanline = line.strip('\n')
            # Renamed from 'host' so it no longer shadows the HEC host parameter.
            (scanned_host, certhash) = cleanline.split(',', 1)
            newhost = {
                'host': scanned_host,
                'hash': certhash,
                'seen': filedate,
                'seen_epoch': filedate_epoch,
            }
            if useesid:
                # Deterministic ID, matching the Elasticsearch importers above.
                cert_hash = hashlib.sha1(
                    (newhost['host'] + newhost['hash'] + 'sonar').encode('utf-8'))
                newhost['id'] = cert_hash.hexdigest()
            newhost = proccess_host(newhost, logger)
            payload = {
                "index": index,
                "host": scanned_host,
                "sourcetype": sourcetype,
                "source": gzfilename,
                "event": newhost,
            }
            hec.batchEvent(payload)
            batchcount += 1
            if batchcount == batchsize:
                hec.flushBatch()
                batchcount = 0
        # Flush any remaining events that didn't fill a full batch.
        if batchcount > 0:
            hec.flushBatch()
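
# A hypothetical invocation of the single-file loader above; the token, path,
# and Splunk host are placeholders, and http_event_collector is assumed to be
# George Starcher's Splunk HEC helper class.
def load_single_file_example():
    logging.basicConfig(level=logging.WARNING)
    process_hosts_file('/data/sonar/20161031_hosts.gz',
                       key='00000000-0000-0000-0000-000000000000',
                       logger=logging.getLogger('SSLImporter'),
                       host='splunk.example.com',
                       useesid=True)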

def process_hosts_file(file_queue, key, hostlist=['localhost'],
                       index='sonarsslhost', sourcetype='sonarsslhost',
                       batchsize=16384, useesid=False):
    logger = logging.getLogger('SSLImporter')
    while True:
        # Spread the load across the available HEC endpoints.
        hec_host = random.choice(hostlist)
        logger.debug("Using HEC endpoint %s", hec_host)
        hec = http_event_collector(key, hec_host)
        gzfilename = file_queue.get()
        if gzfilename == "DONE":
            return True
        logger.warning("Loading file {f} at {d}".format(f=gzfilename, d=datetime.now()))
        with gzip.open(gzfilename, 'rt') as resultsfile:
            # File names carry the scan date as either YYYYMMDD or YYYY-MM-DD.
            m = re.search(r'.*/(\d{8})', gzfilename)
            if m:
                filedate = m.group(1)
            else:
                m = re.search(r'.*/(\d{4}-\d{2}-\d{2})', gzfilename)
                filedate = m.group(1).replace('-', '')
            filedate_struct = time.strptime(filedate, "%Y%m%d")
            filedate_epoch = time.mktime(filedate_struct)
            batchcount = 0
            for line in resultsfile:
                cleanline = line.strip('\n')
                (scanned_host, certhash) = cleanline.split(',', 1)
                newhost = {
                    'host': scanned_host,
                    'hash': certhash,
                    'seen': filedate,
                    'seen_epoch': filedate_epoch,
                }
                if useesid:
                    cert_hash = hashlib.sha1(
                        (newhost['host'] + newhost['hash'] + 'sonar').encode('utf-8'))
                    newhost['id'] = cert_hash.hexdigest()
                newhost = proccess_host(newhost, logger)
                payload = {
                    "index": index,
                    "host": scanned_host,
                    "sourcetype": sourcetype,
                    "source": gzfilename,
                    "event": newhost,
                }
                hec.batchEvent(payload)
                batchcount += 1
                if batchcount == batchsize:
                    hec.flushBatch()
                    batchcount = 0
            if batchcount > 0:
                hec.flushBatch()
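
# A minimal driver sketch (hypothetical names, hosts, and file list) for the
# queue-based loader above: several worker processes share one file queue, and
# each worker gets its own "DONE" sentinel so its while-loop can exit.
def run_file_workers_example(gzfiles, key, num_workers=2):
    from multiprocessing import Process, Queue

    file_queue = Queue()
    workers = [Process(target=process_hosts_file,
                       args=(file_queue, key),
                       kwargs={'hostlist': ['hec1.example.com', 'hec2.example.com']})
               for _ in range(num_workers)]
    for w in workers:
        w.start()
    for gzfilename in gzfiles:
        file_queue.put(gzfilename)
    for _ in workers:
        file_queue.put("DONE")  # one sentinel per worker
    for w in workers:
        w.join()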