def load_cert_db(self, main_file, bigdb):
    """
    Loads big fprint -> certificate database to memory
    :param main_file:
    :param bigdb:
    :return:
    """
    counter = 0

    # Open the main file, gzipped or not
    if main_file.endswith('gz'):
        fh = gzip.open(main_file, 'rb')
    else:
        fh = open(main_file, 'rb')

    errors = 0
    with fh:
        for idx, line in enumerate(fh):
            try:
                fprint, cert = line.split(',', 1)
                cert = cert.strip()
                fprint = utils.strip_hex_prefix(fprint.strip()).lower()
                certbin = base64.b64decode(cert)
                bigdb[fprint] = certbin

                counter += 1
                if counter % 10000 == 0:
                    logger.debug(' .. progress %s, fprint %s, memory: %s MB'
                                 % (counter, fprint, utils.get_mem_usage() / 1024.0))

            except Exception as e:
                errors += 1
                logger.error('Error in processing %s' % e)
                self.trace_logger.log(e)

    logger.info('Uff... big DB loaded, num entries: %s, errors: %s, memory: %s MB'
                % (len(bigdb), errors, utils.get_mem_mb()))
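# --- Usage sketch (hypothetical): how load_cert_db() could be fed. Assumes
# `proc` is an instance of the surrounding class; the helper and file name
# below are illustrative only. The expected input is one
# "fingerprint,base64(DER certificate)" record per line, optionally gzipped.
import base64
import gzip

def write_sample_cert_db(path, certs):
    """certs: iterable of (hex_fingerprint, der_bytes) pairs."""
    with gzip.open(path, 'wb') as out:
        for fprint, der in certs:
            out.write(b'%s,%s\n' % (fprint.encode('ascii'), base64.b64encode(der)))

# bigdb = {}
# proc.load_cert_db('sample_certs.csv.gz', bigdb)   # fills bigdb: fprint -> DER bytes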
def display_stats():
    draw = ImageDraw.Draw(draw_image)

    x = 0
    padding = -1
    top = padding
    bottom = height - padding

    # Draw a black filled box to clear the image.
    draw.rectangle((0, 0, width, height), outline=0, fill=0)

    draw.text((x, top), ("IP: %s" % utils.get_ip()), font=small_font, fill=255)
    draw.text((x, top + 8), ("Load: %2.1f%% MHz: %d" % (utils.get_cpu_load(), utils.get_arm_clockspeed())),
              font=small_font, fill=255)
    draw.text((x, top + 16), ("CPU: %2.1fC GPU:%2.1fC" % (utils.get_cpu_temp(), utils.get_gpu_temp())),
              font=small_font, fill=255)

    mem = utils.get_mem_usage()
    draw.text((x, top + 24), ("Mem: %d/%dMB %2.2f%%" % (mem[0], mem[1], mem[2])), font=small_font, fill=255)

    display_image(draw_image)
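# --- Setup sketch (assumptions): the module-level names display_stats() relies
# on. It expects draw_image, width, height, small_font and display_image() to
# exist; the values below are illustrative (128x32 is a common SSD1306 OLED
# size) and the real display driver is intentionally omitted.
from PIL import Image, ImageDraw, ImageFont

width, height = 128, 32
draw_image = Image.new('1', (width, height))   # 1-bit framebuffer image
small_font = ImageFont.load_default()

def display_image(image):
    # Placeholder: push the rendered image to the actual display driver here.
    pass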
def process_iobj(self, iobj):
    """
    Processing
    :param iobj:
    :return:
    """
    input_name = self.iobj_name(iobj)
    logger.info('Processing: %s' % input_name)

    finish_file = self.get_finish_file(input_name)
    if os.path.exists(finish_file):
        logger.info('Finish indicator file exists, skipping: %s' % finish_file)
        return

    self.cur_decompressor = None
    self.cur_state_file = self.get_state_file(input_name)
    file_leafs = self.get_classification_leafs(input_name)
    file_roots = self.get_classification_roots(input_name)
    self.last_record_resumed = None
    self.processor = newline_reader.NewlineReader(is_json=False)

    handle = iobj
    name = str(iobj)

    if name.endswith('lz4'):
        self.cur_decompressor = lz4framed.Decompressor(handle)
        handle = self.cur_decompressor

    if not self.is_dry() and (not self.args.continue1
                              or not os.path.exists(file_leafs)
                              or not os.path.exists(file_roots)):
        utils.safely_remove(file_leafs)
        utils.safely_remove(file_roots)
        self.file_leafs_fh = utils.safe_open(file_leafs, mode='w', chmod=0o644)
        self.file_roots_fh = utils.safe_open(file_roots, mode='w', chmod=0o644)

    elif self.args.continue1:
        logger.info('Continuing with the started files')
        self.file_leafs_fh = open(file_leafs, mode='r+' if not self.is_dry() else 'r')
        self.file_roots_fh = open(file_roots, mode='r+' if not self.is_dry() else 'r')
        self.restore_checkpoint(iobj)
        self.continue_leafs(file_leafs)

    with iobj:
        resume_token_found = False
        resume_token = None
        resume_idx = 0
        record_ctr = -1
        already_processed = 0
        read_start = self.read_data

        for idx, record in self.processor.process(handle):
            try:
                record_ctr += 1
                self.read_data += len(record)

                # Check the checkpoint distance + boundary - process all newline chunks available
                if self.read_data - self.last_report >= 1024 * 1024 * 1024 and self.processor.step_cur_last_element:
                    logger.info('...progress: %s GB, idx: %s, pos: %s GB, '
                                'found: %s, mem: %04.8f MB, readpos: %s (%4.6f GB)'
                                % (self.read_data / 1024.0 / 1024.0 / 1024.0, idx, self.read_data,
                                   self.num_found, utils.get_mem_usage() / 1024.0,
                                   iobj.tell(), iobj.tell() / 1024.0 / 1024.0 / 1024.0))

                    self.last_report = self.read_data
                    self.try_store_checkpoint(iobj=iobj, idx=idx, resume_idx=resume_idx, resume_token=resume_token)

                    # Flush already seen IP database, not needed anymore
                    # we are too far from the resumed checkpoint
                    if read_start + 1024 * 1024 * 1024 * 2 > self.read_data:
                        self.state_loaded_ips = set()

                js = json.loads(record)
                self.process_record(idx, js)

            except Exception as e:
                logger.error('Exception in processing %d: %s' % (self.ctr, e))
                logger.debug(traceback.format_exc())
                logger.debug(record)

            self.ctr += 1

        logger.info('Total: %d' % self.ctr)
        logger.info('Total_chain: %d' % self.chain_ctr)
        logger.info('Not tls: %d' % self.not_tls)
        logger.info('Not cert ok: %d' % self.not_cert_ok)
        logger.info('Not chain ok: %d' % self.not_chain_ok)
        logger.info('Not parsed: %d' % self.not_parsed)
        logger.info('Not rsa: %d' % self.not_rsa)

    logger.info('Processed: %s' % iobj)
    if not self.is_dry():
        self.file_leafs_fh.close()
        self.file_roots_fh.close()
        utils.try_touch(finish_file)
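# --- Isolated sketch (hypothetical names): the "report/checkpoint roughly every
# 1 GiB of consumed records" gate used inside the loop above, shown on its own.
# Not part of the class API.
GIB = 1024 * 1024 * 1024

class ProgressGate(object):
    """Returns True from feed() about once per `interval` bytes of records."""
    def __init__(self, interval=GIB):
        self.interval = interval
        self.read_data = 0
        self.last_report = 0

    def feed(self, record_len):
        self.read_data += record_len
        if self.read_data - self.last_report >= self.interval:
            self.last_report = self.read_data
            return True
        return False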
def process(self, iobj):
    """
    Process input object - read LZ4, produce metadata
    :param iobj:
    :return:
    """
    input_name = self.iobj_name(iobj)
    logger.info('Processing: %s' % input_name)

    finish_file = self.get_finish_file(input_name)
    if os.path.exists(finish_file):
        logger.info('Finish indicator file exists, skipping: %s' % finish_file)
        return

    self.cur_decompressor = None
    self.cur_state_file = self.get_state_file(input_name)
    self.processor = newline_reader.NewlineReader(is_json=False)

    if self.args.copy_dir is not None:
        copy_path = os.path.join(self.args.copy_dir, input_name)
        logger.info('Going to create a copy to %s' % copy_path)
        self.cur_copy_fh = open(copy_path, 'w')

    handle = iobj
    name = str(iobj)

    if self.cur_copy_fh is not None:
        handle = input_obj.TeeInputObject(parent_fh=handle, copy_fh=self.cur_copy_fh)

    if name.endswith('lz4'):
        self.cur_decompressor = lz4framed.Decompressor(handle)
        handle = self.cur_decompressor

    if self.args.continue1:
        logger.info('Continuing with the started files')
        self.restore_checkpoint(iobj)

    with iobj:
        record_ctr = -1
        read_start = self.read_data

        for idx, record in self.processor.process(handle):
            try:
                record_ctr += 1
                self.read_data += len(record)

                # Check the checkpoint distance + boundary - process all newline chunks available
                if self.read_data - self.last_report >= 1024 * 1024 * 1024 and self.processor.step_cur_last_element:
                    logger.info('...progress: %s GB, idx: %s, pos: %s GB, mem: %04.8f MB, readpos: %s (%4.6f GB)'
                                % (self.read_data / 1024.0 / 1024.0 / 1024.0, idx, self.read_data,
                                   utils.get_mem_usage() / 1024.0,
                                   iobj.tell(), iobj.tell() / 1024.0 / 1024.0 / 1024.0))

                    self.last_report = self.read_data
                    self.try_store_checkpoint(iobj=iobj, idx=idx)

                    # Flush already seen IP database, not needed anymore
                    # we are too far from the resumed checkpoint
                    if read_start + 1024 * 1024 * 1024 * 2 > self.read_data:
                        self.state_loaded_ips = set()

            except Exception as e:
                logger.error('Exception in processing %d: %s' % (self.ctr, e))
                logger.debug(traceback.format_exc())

            self.ctr += 1

    logger.info('Processed: %s' % iobj)
    if self.cur_copy_fh is not None:
        self.cur_copy_fh.close()

    utils.try_touch(finish_file)
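# --- Minimal sketch: streaming LZ4-frame decompression as used above. Assumes
# the py-lz4framed package; the file name is illustrative. Decompressor wraps a
# file-like object and yields decompressed chunks when iterated.
import lz4framed

def iter_decompressed_chunks(path):
    with open(path, 'rb') as fh:
        try:
            for chunk in lz4framed.Decompressor(fh):
                yield chunk
        except lz4framed.Lz4FramedNoDataError:
            pass  # input exhausted

# for chunk in iter_decompressed_chunks('snapshot.json.lz4'):
#     ...  # feed chunks to a newline-aware record reader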
def _export(self):
    return dict(memory_usage=utils.get_mem_usage())
def main(self):
    """
    Processing censys sonar.ssl
    Recodes one big certificate file to smaller _certs.gz files as published since 2015
    so we can process it in the same way.
    https://scans.io/study/sonar.ssl
    :return:
    """
    parser = argparse.ArgumentParser(description='Recoding big sonarssl file to the incremental one')

    parser.add_argument('--url', dest='url', nargs=argparse.ZERO_OR_MORE, default=[],
                        help='censys links')

    parser.add_argument('--json', dest='json', default=None,
                        help='sonar links json')

    parser.add_argument('--datadir', dest='datadir', default='.',
                        help='datadir')

    parser.add_argument('--fprint-only', dest='fprint_only', default=False, action='store_const', const=True,
                        help='Only fprint gen')

    parser.add_argument('--base-only', dest='base_only', default=False, action='store_const', const=True,
                        help='Chunk only one big dataset sample')

    parser.add_argument('file', nargs=argparse.ZERO_OR_MORE, default=[],
                        help='censys link file')

    args = parser.parse_args()

    # Big in memory hash table fprint -> certificate
    bigdb = {}
    testrng = range(10, 93) if args.base_only else range(10, 181)

    # fprints seen
    fprints_seen_set = set()
    fprints_previous = set()

    if not args.fprint_only:
        if len(args.file) == 0:
            return

        main_file = args.file[0]
        self.load_cert_db(main_file, bigdb)

    jsdb = None
    with open(args.json, 'r') as fh:
        jsdb = json.load(fh)

    jsdb_ids = {x['id']: x for x in jsdb['data']}
    for test_idx in testrng:
        files = jsdb_ids[test_idx]['files']
        filerec = None
        for tmprec in files:
            if '_hosts.gz' in tmprec:
                filerec = files[tmprec]
                break

        fname = filerec['name']
        flink = filerec['href']

        # 20131104/20131104_hosts.gz
        fname_2 = fname.split('/')
        if len(fname_2) == 2:
            fname_2 = fname_2[1]
        else:
            fname_2 = fname_2[0]

        dateparts = fname_2.split('_')
        datepart = dateparts[0]

        hostfile = os.path.join(args.datadir, '%s_hosts.gz' % datepart)
        certfile = os.path.join(args.datadir, '%s_certs.gz' % datepart)
        fprintfile = os.path.join(args.datadir, '%s_fprints.csv' % datepart)
        fprintfile_new = os.path.join(args.datadir, '%s_fprints_new.csv' % datepart)
        fprintfile_new_p = os.path.join(args.datadir, '%s_fprints_new_p.csv' % datepart)
        fprintfile_lost_p = os.path.join(args.datadir, '%s_fprints_lost_p.csv' % datepart)
        fprintfile_same = os.path.join(args.datadir, '%s_fprints_same.csv' % datepart)

        logger.info('Processing test idx %s, file %s, newfile: %s' % (test_idx, fname, certfile))

        not_found = 0
        fprints_set = set()
        fprints_set_new = set()

        iobj = None
        hosth = None
        if os.path.exists(hostfile):
            iobj = input_obj.FileInputObject(fname=hostfile)
        elif args.fprint_only:
            continue
        else:
            hosth = open(hostfile, 'wb')
            iobj = input_obj.ReconnectingLinkInputObject(url=flink, rec=files)
            iobj = input_obj.TeeInputObject(parent_fh=iobj, copy_fh=hosth, close_copy_on_exit=True)

        # Reading host file, ip -> fprints associations
        with iobj:
            fh = gzipinputstream.GzipInputStream(fileobj=iobj)
            for rec_idx, rec in enumerate(fh):
                try:
                    linerec = rec.strip().split(',')
                    ip = linerec[0].strip()
                    fprints = linerec[1:]
                    for fprint in fprints:
                        fprint = utils.strip_hex_prefix(fprint.strip()).lower()
                        fprints_set.add(fprint)

                    if rec_idx % 1000000 == 0:
                        iobj.flush()
                        logger.debug(' .. progress %s, ip %s, mem: %s MB'
                                     % (rec_idx, ip, utils.get_mem_usage() / 1024.0))

                except Exception as e:
                    logger.error('Exception in processing rec %s: %s' % (rec_idx, e))
                    logger.debug(rec)
                    logger.debug(traceback.format_exc())

        fprints_len = len(fprints_set)
        logger.info('File processed, fprint db size: %d. Mem: %s MB' % (fprints_len, utils.get_mem_mb()))

        # Only fingerprints
        logger.info('Going to sort fprints...')
        fprints = list(fprints_set)
        fprints.sort()
        logger.info('fprints sorted. Storing fingerprints. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))

        # Store only new fingerprints, not seen before
        logger.info('Storing new fingerprints. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))
        with open(fprintfile_new, 'w') as outfh:
            for fprint in fprints:
                if fprint not in fprints_seen_set:
                    outfh.write('%s\n' % fprint)
                    fprints_set_new.add(fprint)
                    fprints_seen_set.add(fprint)

        # Certificates new from previous
        logger.info('Storing new fingerprints from previous. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))
        with open(fprintfile_new_p, 'w') as outfh:
            for fprint in fprints:
                if fprint not in fprints_previous:
                    outfh.write('%s\n' % fprint)

        # Certificates removed from previous
        logger.info('Storing lost fingerprints from previous. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))
        fprints_previous_list = list(fprints_previous)
        fprints_previous_list.sort()
        with open(fprintfile_lost_p, 'w') as outfh:
            for fprint in fprints_previous_list:
                if fprint not in fprints_set:
                    outfh.write('%s\n' % fprint)

        # Certificates same as in the previous dataset
        logger.info('Storing same fingerprints as previous. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))
        with open(fprintfile_same, 'w') as outfh:
            for fprint in fprints:
                if fprint in fprints_previous:
                    outfh.write('%s\n' % fprint)

        # Store only fingerprints contained in this set.
        with open(fprintfile, 'w') as outfh:
            for fprint in fprints:
                outfh.write('%s\n' % fprint)

        if args.fprint_only:
            fprints_previous = set(fprints_set)
            continue

        # Certificates file _certs.gz - only new certificates
        fprints_new = list(fprints_set_new)
        fprints_new.sort()

        fprints_len = len(fprints_new)
        fprints_progress_unit = fprints_len / 100
        fprints_progress_last = 0

        logger.info('Dumping only new certificates, fprint db size: %d' % fprints_len)
        with gzip.open(certfile, 'wb') as outfh:
            for rec_idx, fprint in enumerate(fprints_new):
                if fprints_progress_last + fprints_progress_unit < rec_idx:
                    fprints_progress_last = rec_idx
                    outfh.flush()
                    logger.debug(' .. progress %s, mem: %s MB' % (rec_idx, utils.get_mem_usage() / 1024.0))

                if fprint in bigdb:
                    outfh.write('%s,%s\n' % (fprint, base64.b64encode(bigdb[fprint])))
                else:
                    not_found += 1

        logger.info('Finished with idx %s, file %s, newfile: %s, not found: %s, mem: %s MB'
                    % (test_idx, fname, certfile, not_found, utils.get_mem_usage() / 1024.0))

        # Final step - store to previous
        fprints_previous = set(fprints_set)
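# --- Example invocations (script name and paths are illustrative), based on
# the arguments defined in main() above:
#
#   # Full run: big sonar.ssl certificate dump as positional file + links JSON
#   python recode_sonarssl.py --json sonar_links.json --datadir ./data certs_big.gz
#
#   # Fingerprint generation only, no certificate chunking
#   python recode_sonarssl.py --json sonar_links.json --datadir ./data --fprint-only
#
#   # Chunk only the one big dataset sample
#   python recode_sonarssl.py --json sonar_links.json --datadir ./data --base-only certs_big.gz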