def report(self):
    """
    Periodically dumps processing statistics to the log, throttled to once per self.state_time_dump seconds.
    :return:
    """
    ct = time.time()
    if ct - self.state_last_dump < self.state_time_dump:
        return

    logger.debug('.. rsa: %s, non-rsa: %s, errs: %s, nofpr: %s, nor: %s, exp: %s, found: %s, not CA:'
                 ' %s, mem: %s MB, depth: %s, cfile: %s'
                 % (self.num_rsa, self.num_non_rsa, self.num_errs, self.num_no_fprint_raw,
                    self.num_no_raw, self.num_expired, self.num_found, self.num_not_ca,
                    utils.get_mem_mb(), self.cur_depth, self.cur_file))

    self.state_last_dump = ct
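# --- Illustrative sketch (not part of the original sources) ------------------
# report() above uses a simple time-based throttle so that frequent calls only
# emit one log line per `state_time_dump` seconds. A minimal self-contained
# version of the same pattern is shown below; the ThrottledReporter name and
# min_interval parameter are hypothetical, chosen only for illustration.
import logging
import time


class ThrottledReporter(object):
    def __init__(self, min_interval=10.0):
        self.min_interval = min_interval   # seconds between emitted log lines
        self.last_dump = 0.0               # timestamp of the last emitted line

    def report(self, **stats):
        ct = time.time()
        if ct - self.last_dump < self.min_interval:
            return  # called too soon after the last report, skip
        logging.getLogger(__name__).debug('.. stats: %s', stats)
        self.last_dump = ct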
def load_cert_db(self, main_file, bigdb):
    """
    Loads the big fprint -> certificate database to memory.
    :param main_file: CSV file (optionally gzipped) with `fingerprint,base64(certificate)` lines
    :param bigdb: dictionary to fill with fingerprint -> raw certificate bytes
    :return:
    """
    counter = 0

    # Open the main file, gzipped or not
    if main_file.endswith('gz'):
        fh = gzip.open(main_file, 'rb')
    else:
        fh = open(main_file, 'rb')

    errors = 0
    with fh:
        for idx, line in enumerate(fh):
            try:
                fprint, cert = line.split(',', 1)
                cert = cert.strip()
                fprint = utils.strip_hex_prefix(fprint.strip()).lower()

                certbin = base64.b64decode(cert)
                bigdb[fprint] = certbin

                counter += 1
                if counter % 10000 == 0:
                    logger.debug(' .. progress %s, fprint %s, memory: %s MB'
                                 % (counter, fprint, utils.get_mem_usage() / 1024.0))

            except Exception as e:
                errors += 1
                logger.error('Error in processing %s' % e)
                self.trace_logger.log(e)

    logger.info('Uff... big DB loaded, num entries: %s, errors: %s, memory: %s MB'
                % (len(bigdb), errors, utils.get_mem_mb()))
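# --- Illustrative sketch (not part of the original sources) ------------------
# load_cert_db() expects one `fingerprint,base64(certificate)` record per line.
# The snippet below builds and parses one such record with the same steps
# (strip an optional hex prefix, lowercase, base64-decode). The local
# _strip_hex_prefix() helper is only a guess at what utils.strip_hex_prefix
# does in the project; treat it as an assumption.
import base64


def _strip_hex_prefix(value):
    # Hypothetical stand-in: drop a leading '0x'/'0X' if present.
    return value[2:] if value[:2].lower() == '0x' else value


def _parse_cert_line(line, bigdb):
    fprint, cert = line.split(',', 1)
    fprint = _strip_hex_prefix(fprint.strip()).lower()
    bigdb[fprint] = base64.b64decode(cert.strip())


# Usage example:
#   demo = {}
#   _parse_cert_line('0xAABBCC,' + base64.b64encode(b'dummy-der').decode('ascii'), demo)
#   # demo == {'aabbcc': b'dummy-der'}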
def process_iobj(self, iobj):
    """
    Processes one input object (scan file), with checkpoint/resume support.
    :param iobj:
    :return:
    """
    input_name = self.iobj_name(iobj)
    logger.info('Processing: %s' % input_name)

    finish_file = self.get_finish_file(input_name)
    if os.path.exists(finish_file):
        logger.info('Finish indicator file exists, skipping: %s' % finish_file)
        return

    self.cur_decompressor = None
    self.cur_state_file = self.get_state_file(input_name)
    file_leafs = self.get_classification_leafs(input_name)
    file_roots = self.get_classification_roots(input_name)
    self.last_record_resumed = None

    self.processor = newline_reader.NewlineReader(is_json=False)
    handle = iobj
    name = str(iobj)

    if name.endswith('lz4'):
        self.cur_decompressor = lz4framed.Decompressor(handle)
        handle = self.cur_decompressor

    if not self.is_dry() and (not self.args.continue1
                              or not os.path.exists(file_leafs)
                              or not os.path.exists(file_roots)):
        utils.safely_remove(file_leafs)
        utils.safely_remove(file_roots)
        self.file_leafs_fh = utils.safe_open(file_leafs, mode='w', chmod=0o644)
        self.file_roots_fh = utils.safe_open(file_roots, mode='w', chmod=0o644)

    elif self.args.continue1:
        logger.info('Continuing with the started files')
        self.file_leafs_fh = open(file_leafs, mode='r+' if not self.is_dry() else 'r')
        self.file_roots_fh = open(file_roots, mode='r+' if not self.is_dry() else 'r')
        self.restore_checkpoint(iobj)
        self.continue_roots()
        self.continue_leafs(file_leafs)

    with iobj:
        resume_token_found = False
        resume_token = None
        resume_idx = 0
        record_ctr = -1
        already_processed = 0
        read_start = self.read_data

        for idx, record in self.processor.process(handle):
            try:
                record_ctr += 1
                self.read_data += len(record)

                # Check the checkpoint distance + boundary - process all newline chunks available
                if self.read_data - self.last_report >= 1024 * 1024 * 1024 \
                        and self.processor.step_cur_last_element:
                    logger.info('...progress: %s GB, idx: %s, pos: %s GB, mem: %04.8f MB, '
                                'readpos: %s (%4.6f GB)'
                                % (self.read_data / 1024.0 / 1024.0 / 1024.0, idx, self.read_data,
                                   utils.get_mem_mb(), iobj.tell(),
                                   iobj.tell() / 1024.0 / 1024.0 / 1024.0))

                    self.last_report = self.read_data
                    self.try_store_checkpoint(iobj=iobj, idx=idx, resume_idx=resume_idx,
                                              resume_token=resume_token)

                    # Flush already seen IP database, not needed anymore
                    # we are too far from the resumed checkpoint
                    if read_start + 1024 * 1024 * 1024 * 2 > self.read_data:
                        self.state_loaded_ips = set()

                js = json.loads(record)

                # If there are more records after the last checkpoint load, skip duplicates
                if js['ip'] in self.state_loaded_ips:
                    already_processed += 1
                    continue

                self.process_record(idx, js)

            except Exception as e:
                logger.error('Exception in processing %d: %s' % (self.ctr, e))
                logger.debug(traceback.format_exc())
                logger.debug(record)

            self.ctr += 1

        logger.info('Total: %d' % self.ctr)
        logger.info('Total_chain: %d' % self.chain_ctr)
        logger.info('Not tls: %d' % self.not_tls)
        logger.info('Not cert ok: %d' % self.not_cert_ok)
        logger.info('Not chain ok: %d' % self.not_chain_ok)
        logger.info('Not parsed: %d' % self.not_parsed)
        logger.info('Not rsa: %d' % self.not_rsa)

    logger.info('Processed: %s' % iobj)
    if not self.is_dry():
        self.file_leafs_fh.close()
        self.file_roots_fh.close()
        utils.try_touch(finish_file)
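# --- Illustrative sketch (not part of the original sources) ------------------
# process_iobj() streams newline-delimited JSON records, skips records whose
# IP was already handled before the restored checkpoint, and reports progress
# roughly every 1 GB of consumed input. A stripped-down version of that loop,
# with hypothetical names (process_record, loaded_ips), might look like this:
import json


def _process_ndjson_stream(lines, loaded_ips, process_record, report_every=1024 ** 3):
    read_data = 0
    last_report = 0
    for idx, record in enumerate(lines):
        read_data += len(record)
        if read_data - last_report >= report_every:
            last_report = read_data
            print('.. progress: %.3f GB, idx: %s' % (read_data / 1024.0 ** 3, idx))

        js = json.loads(record)
        if js['ip'] in loaded_ips:
            continue  # duplicate from a resumed run, already processed
        process_record(idx, js)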
def main(self):
    """
    Processing censys sonar.ssl.
    Recodes one big certificate file to smaller _certs.gz files as published since 2015
    so we can process it in the same way.
    https://scans.io/study/sonar.ssl
    :return:
    """
    parser = argparse.ArgumentParser(description='Recoding big sonarssl file to the incremental one')

    parser.add_argument('--url', dest='url', nargs=argparse.ZERO_OR_MORE, default=[],
                        help='censys links')
    parser.add_argument('--json', dest='json', default=None,
                        help='sonar links json')
    parser.add_argument('--datadir', dest='datadir', default='.',
                        help='datadir')
    parser.add_argument('--fprint-only', dest='fprint_only', default=False, action='store_const', const=True,
                        help='Only fprint gen')
    parser.add_argument('--base-only', dest='base_only', default=False, action='store_const', const=True,
                        help='Chunk only one big dataset sample')
    parser.add_argument('file', nargs=argparse.ZERO_OR_MORE, default=[],
                        help='censys link file')

    args = parser.parse_args()

    # Big in memory hash table fprint -> certificate
    bigdb = {}
    testrng = range(10, 93) if args.base_only else range(10, 181)

    # fprints seen
    fprints_seen_set = set()
    fprints_previous = set()

    if not args.fprint_only:
        if len(args.file) == 0:
            return

        main_file = args.file[0]
        self.load_cert_db(main_file, bigdb)

    jsdb = None
    with open(args.json, 'r') as fh:
        jsdb = json.load(fh)

    jsdb_ids = {x['id']: x for x in jsdb['data']}
    for test_idx in testrng:
        files = jsdb_ids[test_idx]['files']
        filerec = None

        for tmprec in files:
            if '_hosts.gz' in tmprec:
                filerec = files[tmprec]
                break

        fname = filerec['name']
        flink = filerec['href']

        # 20131104/20131104_hosts.gz
        fname_2 = fname.split('/')
        if len(fname_2) == 2:
            fname_2 = fname_2[1]
        else:
            fname_2 = fname_2[0]

        dateparts = fname_2.split('_')
        datepart = dateparts[0]

        hostfile = os.path.join(args.datadir, '%s_hosts.gz' % datepart)
        certfile = os.path.join(args.datadir, '%s_certs.gz' % datepart)
        fprintfile = os.path.join(args.datadir, '%s_fprints.csv' % datepart)
        fprintfile_new = os.path.join(args.datadir, '%s_fprints_new.csv' % datepart)
        fprintfile_new_p = os.path.join(args.datadir, '%s_fprints_new_p.csv' % datepart)
        fprintfile_lost_p = os.path.join(args.datadir, '%s_fprints_lost_p.csv' % datepart)
        fprintfile_same = os.path.join(args.datadir, '%s_fprints_same.csv' % datepart)

        logger.info('Processing test idx %s, file %s, newfile: %s' % (test_idx, fname, certfile))

        not_found = 0
        fprints_set = set()
        fprints_set_new = set()
        iobj = None
        hosth = None

        if os.path.exists(hostfile):
            iobj = input_obj.FileInputObject(fname=hostfile)
        elif args.fprint_only:
            continue
        else:
            hosth = open(hostfile, 'wb')
            iobj = input_obj.ReconnectingLinkInputObject(url=flink, rec=files)
            iobj = input_obj.TeeInputObject(parent_fh=iobj, copy_fh=hosth, close_copy_on_exit=True)

        # Reading host file, ip -> fprints associations
        with iobj:
            fh = gzipinputstream.GzipInputStream(fileobj=iobj)
            for rec_idx, rec in enumerate(fh):
                try:
                    linerec = rec.strip().split(',')
                    ip = linerec[0].strip()
                    fprints = linerec[1:]

                    for fprint in fprints:
                        fprint = utils.strip_hex_prefix(fprint.strip()).lower()
                        fprints_set.add(fprint)

                    if rec_idx % 1000000 == 0:
                        iobj.flush()
                        logger.debug(' .. progress %s, ip %s, mem: %s MB'
                                     % (rec_idx, ip, utils.get_mem_usage() / 1024.0))

                except Exception as e:
                    logger.error('Exception in processing rec %s: %s' % (rec_idx, e))
                    logger.debug(rec)
                    logger.debug(traceback.format_exc())

        fprints_len = len(fprints_set)
        logger.info('File processed, fprint db size: %d. Mem: %s MB' % (fprints_len, utils.get_mem_mb()))

        # Only fingerprints
        logger.info('Going to sort fprints...')
        fprints = list(fprints_set)
        fprints.sort()
        logger.info('fprints sorted. Storing fingerprints. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))

        # Store only new fingerprints, not seen before
        logger.info('Storing new fingerprints. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))
        with open(fprintfile_new, 'w') as outfh:
            for fprint in fprints:
                if fprint not in fprints_seen_set:
                    outfh.write('%s\n' % fprint)
                    fprints_set_new.add(fprint)
                    fprints_seen_set.add(fprint)

        # Certificates new from previous
        logger.info('Storing new fingerprints from previous. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))
        with open(fprintfile_new_p, 'w') as outfh:
            for fprint in fprints:
                if fprint not in fprints_previous:
                    outfh.write('%s\n' % fprint)

        # Certificates removed from previous
        logger.info('Storing lost fingerprints from previous. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))
        fprints_previous_list = list(fprints_previous)
        fprints_previous_list.sort()
        with open(fprintfile_lost_p, 'w') as outfh:
            for fprint in fprints_previous_list:
                if fprint not in fprints_set:
                    outfh.write('%s\n' % fprint)

        # Certificates same as in the previous dataset
        logger.info('Storing same fingerprints as previous. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))
        with open(fprintfile_same, 'w') as outfh:
            for fprint in fprints:
                if fprint in fprints_previous:
                    outfh.write('%s\n' % fprint)

        # Store only fingerprints contained in this set.
        with open(fprintfile, 'w') as outfh:
            for fprint in fprints:
                outfh.write('%s\n' % fprint)

        if args.fprint_only:
            fprints_previous = set(fprints_set)
            continue

        # Certificates file _certs.gz - only new certificates
        fprints_new = list(fprints_set_new)
        fprints_new.sort()

        fprints_len = len(fprints_new)
        fprints_progress_unit = fprints_len / 100
        fprints_progress_last = 0

        logger.info('Dumping only new certificates, fprint db size: %d' % fprints_len)
        with gzip.open(certfile, 'wb') as outfh:
            for rec_idx, fprint in enumerate(fprints_new):
                if fprints_progress_last + fprints_progress_unit < rec_idx:
                    fprints_progress_last = rec_idx
                    outfh.flush()
                    logger.debug(' .. progress %s, mem: %s MB'
                                 % (rec_idx, utils.get_mem_usage() / 1024.0))

                if fprint in bigdb:
                    outfh.write('%s,%s\n' % (fprint, base64.b64encode(bigdb[fprint])))
                else:
                    not_found += 1

        logger.info('Finished with idx %s, file %s, newfile: %s, not found: %s, mem: %s MB'
                    % (test_idx, fname, certfile, not_found, utils.get_mem_usage() / 1024.0))

        # Final step - store to previous
        fprints_previous = set(fprints_set)
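# --- Illustrative sketch (not part of the original sources) ------------------
# The recoding loop above keeps three sets per snapshot: the fingerprints of
# the current snapshot, those of the previous snapshot, and all fingerprints
# ever seen. The four derived files (_fprints_new, _fprints_new_p,
# _fprints_lost_p, _fprints_same) are plain set operations; the helper below
# (hypothetical name) shows the same bookkeeping in isolation.
def _classify_fingerprints(current, previous, seen_ever):
    new_ever = current - seen_ever        # never seen in any earlier snapshot -> _fprints_new
    new_vs_prev = current - previous      # not present in the previous snapshot -> _fprints_new_p
    lost_vs_prev = previous - current     # dropped since the previous snapshot -> _fprints_lost_p
    same_as_prev = current & previous     # unchanged since the previous snapshot -> _fprints_same
    return new_ever, new_vs_prev, lost_vs_prev, same_as_prev


# Example: snapshot 2 keeps 'b', gains 'c', loses 'a':
#   _classify_fingerprints({'b', 'c'}, {'a', 'b'}, {'a', 'b'})
#   -> ({'c'}, {'c'}, {'a'}, {'b'})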
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='IP addr diff')

    parser.add_argument('--datadir', dest='datadir', default='.',
                        help='datadir')
    parser.add_argument('--test', dest='test', default='',
                        help='test name')
    parser.add_argument('files', nargs=argparse.ZERO_OR_MORE, default=[],
                        help='files')

    args = parser.parse_args()

    ip_first = set()
    ip_second = set()

    if len(args.files) < 2:
        logger.error('At least 2 input files are required')
        return

    logger.info('Loading first file...')
    load_set(args.files[0], ip_first)
    logger.info('File loaded, #of ip addresses: %s, mem: %s MB' % (len(ip_first), utils.get_mem_mb()))

    logger.info('Loading second file...')
    load_set(args.files[1], ip_second)
    logger.info('File loaded, #of ip addresses: %s, mem: %s MB' % (len(ip_second), utils.get_mem_mb()))

    path_ab = os.path.join(args.datadir, '%s_a_min_b.csv' % args.test)
    path_ba = os.path.join(args.datadir, '%s_b_min_a.csv' % args.test)
    path_sym = os.path.join(args.datadir, '%s_a_sym_b.csv' % args.test)
    path_int = os.path.join(args.datadir, '%s_a_intersection_b.csv' % args.test)
    path_uni = os.path.join(args.datadir, '%s_a_union_b.csv' % args.test)

    logger.info('Dumping a - b')
    with open(path_ab, 'w') as fh:
        res_set = sorted(list(ip_first - ip_second))
        for x in res_set:
            fh.write('%s\n' % x)

    logger.info('Dumping b - a')
    with open(path_ba, 'w') as fh:
        res_set = sorted(list(ip_second - ip_first))
        for x in res_set:
            fh.write('%s\n' % x)

    logger.info('Dumping a ^ b')
    with open(path_sym, 'w') as fh:
        res_set = sorted(list(ip_first ^ ip_second))
        for x in res_set:
            fh.write('%s\n' % x)

    logger.info('Dumping a & b')
    with open(path_int, 'w') as fh:
        res_set = sorted(list(ip_first & ip_second))
        for x in res_set:
            fh.write('%s\n' % x)

    logger.info('Dumping a | b')
    with open(path_uni, 'w') as fh:
        res_set = sorted(list(ip_first | ip_second))
        for x in res_set:
            fh.write('%s\n' % x)
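# --- Illustrative sketch (not part of the original sources) ------------------
# load_set() is referenced above but not shown in this section; a plausible
# implementation (an assumption, the real helper may differ) reads one IP
# address per line into the target set:
def _load_set_sketch(fname, target_set):
    with open(fname, 'r') as fh:
        for line in fh:
            line = line.strip()
            if line:
                target_set.add(line)


# Hypothetical invocation (script name made up for illustration):
#   python ip_diff.py --datadir out --test scan1_vs_scan2 a_ips.csv b_ips.csv
# which would produce scan1_vs_scan2_a_min_b.csv, ..._b_min_a.csv,
# ..._a_sym_b.csv, ..._a_intersection_b.csv and ..._a_union_b.csv under ./out.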
def work(self):
    """
    Processes the sonar.ssl index: loads the big raw-certificates file into memory, then recodes
    the sequential host scans into incremental per-snapshot files, appending a result record per
    snapshot to eco.json.
    :return:
    """
    # Open the json link file
    args = self.args
    index_db = None
    with open(args.json, 'r') as fh:
        index_db = json.load(fh)

    self.index_db = index_db

    # Manage the raw_certificates file
    main_cert_rec = index_db['data'][1]
    main_cert_link = main_cert_rec['fhref']
    main_cert_file = os.path.join(args.datadir, os.path.basename(main_cert_link))
    json_res_file = os.path.join(args.datadir, 'eco.json')
    json_res_fh = open(json_res_file, 'w')

    iobj = None
    if os.path.exists(main_cert_file):
        iobj = input_obj.FileInputObject(fname=main_cert_file)
    elif os.path.exists(main_cert_file[:-3]):  # ungzipped
        main_cert_file = main_cert_file[:-3]
        iobj = input_obj.FileInputObject(fname=main_cert_file)
    else:
        logger.info('Going to download certificate file')
        hosth = open(main_cert_file, 'wb')
        iobj = input_obj.ReconnectingLinkInputObject(url=main_cert_link, rec=main_cert_rec)
        iobj = input_obj.TeeInputObject(parent_fh=iobj, copy_fh=hosth, close_copy_on_exit=True)

    # Big in memory hash table fprint -> certificate
    bigdb = {}
    counter = 0
    testrng = range(15, 171)

    # fprints seen
    fprints_seen_set = set()
    fprints_previous = set()

    # Process the main certificate file
    with iobj:
        fh = iobj
        if main_cert_file.endswith('.gz'):
            fh = gzipinputstream.GzipInputStream(fileobj=iobj)
        else:
            fh = NewlineIterator(iobj)

        for idx, line in enumerate(fh):
            try:
                csv = line.split(',')
                fprint = utils.strip_hex_prefix(csv[0].strip())
                cert = utils.strip_hex_prefix(csv[1].strip())

                certbin = base64.b16decode(cert, True)
                bigdb[fprint] = certbin

                counter += 1
                if counter % 100000 == 0:
                    logger.debug(' .. progress %s, fprint %s, memory: %s MB'
                                 % (counter, fprint, utils.get_mem_mb()))

            except Exception as e:
                logger.error('Error in processing %s' % e)
                logger.debug(traceback.format_exc())

    logger.info('Uff... big DB loaded, num entries: %s' % len(bigdb))

    # Load sequential scans
    # Host file association ip -> fingerprint
    jsdb_ids = {x['id']: x for x in self.index_db['data']}
    last_file_date_utc = 0

    for test_idx in testrng:
        filerec = jsdb_ids[test_idx]
        fname = filerec['fname']
        flink = filerec['fhref']
        datepart = filerec['date']
        date_utc = filerec['date_utc']
        fname_2 = os.path.basename(fname)

        # The dataset is a series of snapshots, so records that are too close in time can be skipped.
        if self.args.space and date_utc - last_file_date_utc < (60*60*24*7 - 60*60):
            logger.info('Skipping record %d, time diff from the previous one is too small: %s'
                        % (test_idx, date_utc - last_file_date_utc))
            continue

        last_file_date_utc = date_utc
        hostfile = os.path.join(args.datadir, fname_2)
        certfile = os.path.join(args.datadir, '%s_certs.gz' % datepart)
        fprintfile_new = os.path.join(args.datadir, '%s_fprints_new.csv' % datepart)
        fprintfile_new_p = os.path.join(args.datadir, '%s_fprints_new_p.csv' % datepart)
        fprintfile_lost_p = os.path.join(args.datadir, '%s_fprints_lost_p.csv' % datepart)

        js_res_rec = collections.OrderedDict()
        js_res_rec['fname'] = fname
        js_res_rec['fhref'] = flink
        js_res_rec['date'] = datepart
        js_res_rec['date_utc'] = date_utc
        js_res_rec['hostfile'] = hostfile
        js_res_rec['certfile'] = certfile
        js_res_rec['fprintfile_new'] = fprintfile_new
        js_res_rec['rec'] = filerec

        logger.info('Processing test idx %s, file %s' % (test_idx, fname))

        not_found = 0
        fprints_set = set()
        fprints_set_new = set()
        iobj = None
        hosth = None

        # Open local or open remote (with download to local)
        if os.path.exists(hostfile):
            iobj = input_obj.FileInputObject(fname=hostfile)
        else:
            hosth = open(hostfile, 'wb')
            iobj = input_obj.ReconnectingLinkInputObject(url=flink, rec=filerec)
            iobj = input_obj.TeeInputObject(parent_fh=iobj, copy_fh=hosth, close_copy_on_exit=True)

        # Processing ip -> fingerprints associations
        with iobj:
            fh = gzipinputstream.GzipInputStream(fileobj=iobj)
            for rec_idx, rec in enumerate(fh):
                try:
                    linerec = rec.strip().split(',')
                    ip = linerec[0].strip()
                    fprint = utils.strip_hex_prefix(linerec[2].strip())
                    fprints_set.add(fprint)

                    if rec_idx % 1000000 == 0:
                        iobj.flush()
                        logger.debug(' .. progress %s, ip %s, mem: %s MB'
                                     % (rec_idx, ip, utils.get_mem_mb()))

                except Exception as e:
                    logger.error('Exception in processing rec %s: %s' % (rec_idx, e))
                    logger.debug(rec)
                    logger.debug(traceback.format_exc())

        fprints_len = len(fprints_set)
        logger.info('File processed, fprint db size: %d' % fprints_len)

        # Only fingerprints
        logger.info('Going to sort fprints...')
        fprints = list(fprints_set)
        fprints.sort()
        logger.info('fprints sorted. Storing fingerprints. Mem: %s MB' % (utils.get_mem_mb()))

        # Store only new fingerprints, not seen before
        logger.info('Storing new fingerprints. Mem: %s MB' % (utils.get_mem_mb()))
        with open(fprintfile_new, 'w') as outfh:
            for fprint in fprints:
                if fprint not in fprints_seen_set:
                    outfh.write('%s\n' % fprint)
                    fprints_set_new.add(fprint)
                    fprints_seen_set.add(fprint)

        # Certificates new from previous
        logger.info('Storing new fingerprints from previous. Mem: %s MB' % (utils.get_mem_mb()))
        with open(fprintfile_new_p, 'w') as outfh:
            for fprint in fprints:
                if fprint not in fprints_previous:
                    outfh.write('%s\n' % fprint)

        # Certificates removed from previous
        logger.info('Storing lost fingerprints from previous. Mem: %s MB' % (utils.get_mem_mb()))
        fprints_previous_list = list(fprints_previous)
        fprints_previous_list.sort()
        with open(fprintfile_lost_p, 'w') as outfh:
            for fprint in fprints_previous_list:
                if fprint not in fprints_set:
                    outfh.write('%s\n' % fprint)

        # Certificates file _certs.gz - only new certificates, incremental records
        fprints_new = list(fprints_set_new)
        fprints_new.sort()

        fprints_len = len(fprints_new)
        fprints_progress_unit = fprints_len / 100
        fprints_progress_last = 0

        logger.info('Dumping only new certificates, fprint db size: %d' % fprints_len)
        with gzip.open(certfile, 'wb') as outfh:
            for rec_idx, fprint in enumerate(fprints_new):
                if fprints_progress_last + fprints_progress_unit < rec_idx:
                    fprints_progress_last = rec_idx
                    outfh.flush()
                    logger.debug(' .. progress %s, mem: %s MB' % (rec_idx, utils.get_mem_mb()))

                if fprint in bigdb:
                    outfh.write('%s,%s\n' % (fprint, base64.b64encode(bigdb[fprint])))
                else:
                    not_found += 1

        logger.info('Finished with idx %s, file %s, newfile: %s, not found: %s, mem: %s MB'
                    % (test_idx, fname, certfile, not_found, utils.get_mem_mb()))

        # Final step - store to previous
        fprints_previous = set(fprints_set)

        # Result file record flush
        json_res_fh.write(json.dumps(js_res_rec) + '\n')
        json_res_fh.flush()

    json_res_fh.close()
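# --- Illustrative sketch (not part of the original sources) ------------------
# Both recoding loops above end by dumping an incremental _certs.gz file: only
# fingerprints that are new in this snapshot are written, each as a
# `fingerprint,base64(certificate)` line, and fingerprints missing from the
# in-memory database are merely counted. A minimal standalone version of that
# step (hypothetical function name, Python 3 text-mode gzip for simplicity):
import base64
import gzip


def _dump_incremental_certs(certfile, new_fprints, bigdb):
    not_found = 0
    with gzip.open(certfile, 'wt') as outfh:
        for fprint in sorted(new_fprints):
            certbin = bigdb.get(fprint)
            if certbin is None:
                not_found += 1          # fingerprint seen in the host scan, certificate not in the DB
                continue
            outfh.write('%s,%s\n' % (fprint, base64.b64encode(certbin).decode('ascii')))
    return not_found


# Usage example:
#   missing = _dump_incremental_certs('20171010_certs.gz',
#                                     {'aabbcc'}, {'aabbcc': b'dummy-der'})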