def generate_workset(self):
    """
    Prepares input objects for processing
    :return:
    """
    # Build input objects from plain files
    for file_name in self.args.file:
        iobj = input_obj.FileInputObject(file_name, rec=None)
        self.input_objects.append(iobj)

    # Build input objects from direct URLs
    for url in self.args.url:
        iobj = self._build_link_object(url=url, rec=None)
        self.input_objects.append(iobj)

    # Optional restriction to selected dataset IDs
    link_indices = None
    if len(self.args.link_idx) > 0:
        link_indices = set([int(x) for x in self.args.link_idx])

    # Build input objects from link files (JSON index of datasets)
    for link_file in self.args.link_file:
        with open(link_file, 'r') as fh:
            data = fh.read()
            js = json.loads(data)
            datasets = js['data']

            for dataset in datasets:
                did = dataset['id']
                if link_indices is not None and did not in link_indices:
                    continue

                iobj = self._build_link_object(
                    url=dataset['files']['zgrab-results.json.lz4']['href'],
                    rec=dataset)
                self.input_objects.append(iobj)
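# A minimal sketch (assumed structure, illustrative values and URL only) of the
# link-file JSON that generate_workset() above iterates over: a top-level 'data'
# list of dataset records, each with an integer 'id' (matched against
# self.args.link_idx when given) and a 'files' map whose 'zgrab-results.json.lz4'
# entry provides the download 'href'.
example_link_file = {
    'data': [
        {
            'id': 10,
            'files': {
                'zgrab-results.json.lz4': {'href': 'https://example.com/zgrab-results.json.lz4'},
            },
        },
    ],
}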
def work(self):
    """
    Entry point after argument processing.
    :return:
    """
    iobjs = []
    for url in self.args.url:
        iobjs.append(input_obj.ReconnectingLinkInputObject(url=url))

    for file_name in self.args.file:
        iobjs.append(input_obj.FileInputObject(file_name))

    for iobj in iobjs:
        self.process(iobj)
def main():
    """
    Processing censys link page to the json
    https://scans.io/study/sonar.ssl
    :return:
    """
    parser = argparse.ArgumentParser(
        description='Processes SonarSSL links from the page, generates json')

    parser.add_argument('--url', dest='url', nargs=argparse.ZERO_OR_MORE, default=[],
                        help='censys links')

    parser.add_argument('file', nargs=argparse.ZERO_OR_MORE, default=[],
                        help='censys link file')

    args = parser.parse_args()

    # Process the input
    dataset_idx = 10
    datasets = {}
    input_objects = []

    for file_name in args.file:
        input_objects.append(input_obj.FileInputObject(file_name))

    for url in args.url:
        input_objects.append(input_obj.LinkInputObject(url))

    if len(input_objects) == 0:
        print('Error: no input given')
        sys.exit(1)

    for iobj in input_objects:
        logger.info('Processing %s' % iobj)
        with iobj:
            data = iobj.text()
            tree = html.fromstring(data)
            tables = tree.xpath('//table')
            if len(tables) == 0:
                logger.error('Parsing problems, no tables found')
                continue

            for tbl_idx, table in enumerate(tables):
                rows = table[1]  # tbody
                rows_cnt = len(rows)
                if rows_cnt < 2:
                    logger.warning('Table %d does not have enough rows: %d' % (tbl_idx, rows_cnt))
                    continue

                for row_idx, row in enumerate(rows):
                    if row[0].tag != 'td':
                        continue

                    file_href = row[0][0].attrib['href'].strip()
                    file_name = row[0][0].text_content().strip()
                    file_hash = row[2].text_content().strip()
                    file_size = row[3].text_content().strip()
                    file_date = row[4].text_content().strip()

                    # Group files by scan date - one dataset record per date
                    if file_date not in datasets:
                        dataset = collections.OrderedDict()
                        dataset['id'] = dataset_idx
                        dataset['date'] = file_date
                        dataset['date_utc'] = utils.unix_time(datetime.strptime(file_date, '%Y-%m-%d'))
                        dataset['files'] = collections.OrderedDict()
                        datasets[file_date] = dataset
                        dataset_idx += 1
                    else:
                        dataset = datasets[file_date]

                    link = Link(file_name, file_href, file_size, file_hash)
                    dataset['files'][file_name] = link.to_json()

    js = collections.OrderedDict()
    js['generated'] = time.time()
    js['data'] = sorted([datasets[x] for x in datasets], key=lambda x: x['id'])
    print(json.dumps(js, indent=2))
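# A hedged sketch (illustrative values only, not output of a real run) of the JSON
# document the SonarSSL main() above prints: datasets are grouped by scan date,
# numbered from 10 upward, and each carries a 'files' map built from Link.to_json();
# the exact per-file keys depend on the Link class.
example_sonarssl_output = {
    'generated': 1500000000.0,          # time.time() at generation
    'data': [
        {
            'id': 10,                   # dataset_idx, starts at 10
            'date': '2013-10-30',       # date column of the table row
            'date_utc': 1383091200,     # utils.unix_time(datetime.strptime(file_date, '%Y-%m-%d'))
            'files': {
                '20131030_hosts.gz': {},  # Link(file_name, file_href, file_size, file_hash).to_json()
            },
        },
    ],
}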
def main():
    """
    Processing censys link page to the json
    https://censys.io/data/443-https-tls-alexa_top1mil/historical
    https://censys.io/data/443-https-tls-full_ipv4/historical
    :return:
    """
    parser = argparse.ArgumentParser(
        description='Processes Censys links from the page, generates json')

    parser.add_argument('--url', dest='url', nargs=argparse.ZERO_OR_MORE, default=[],
                        help='censys links')

    parser.add_argument('file', nargs=argparse.ZERO_OR_MORE, default=[],
                        help='censys link file')

    args = parser.parse_args()

    dataset_idx = 10
    datasets = []
    input_objects = []

    for file_name in args.file:
        input_objects.append(input_obj.FileInputObject(file_name))

    for url in args.url:
        input_objects.append(input_obj.LinkInputObject(url))

    if len(input_objects) == 0:
        print('Error: no input given')
        sys.exit(1)

    for iobj in input_objects:
        logger.info('Processing %s' % iobj)
        with iobj:
            data = iobj.text()
            tree = html.fromstring(data)
            tables = tree.xpath('//table')
            if len(tables) == 0:
                logger.error('Parsing problems, no tables found (probably not logged in)')
                continue

            # Process the tables in reverse page order
            for tbl_idx, table in enumerate(reversed(tables)):
                rows = table[0]
                rows_cnt = len(rows)
                if rows_cnt < 2:
                    logger.warning('Table %d does not have enough rows: %d' % (tbl_idx, rows_cnt))
                    continue

                # The preceding h2 element holds the scan timestamp for this table
                prev_h2 = table.getprevious()
                header = prev_h2.text_content().strip()

                dataset = collections.OrderedDict()
                dataset['id'] = dataset_idx
                dataset['date'] = header
                dataset['date_utc'] = utils.unix_time(datetime.strptime(header, '%Y-%m-%d %H:%M:%S'))
                dataset['files'] = collections.OrderedDict()

                for row_idx, row in enumerate(rows):
                    if row_idx == 0 or row[0].tag != 'td':
                        continue

                    file_href = row[0][0].attrib['href'].strip()
                    file_code = row[0][0].attrib['download'].strip()
                    file_name = row[0][0].text_content().strip()
                    file_type = row[1].text_content().strip()
                    file_size = row[2].text_content().strip()
                    file_hash = row[3].text_content().strip()
                    # logger.info('File %d %s %s %s %s %s %s'
                    #             % (row_idx, file_href, file_code, file_name, file_type, file_size, file_hash))

                    link = Link(file_name, file_code, file_href, file_size, file_type, file_hash)
                    dataset['files'][file_name] = link.to_json()

                if 'zgrab-results.json.lz4' not in dataset['files']:
                    logger.warning('Zgrab result file not found in %d' % dataset_idx)

                logger.info('H: %s, files: %s' % (header, ' '.join([x for x in dataset['files']])))

                datasets.append(dataset)
                dataset_idx += 1

    js = collections.OrderedDict()
    js['generated'] = time.time()
    js['data'] = datasets
    print(json.dumps(js, indent=2))
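# For comparison, a hedged sketch (illustrative values only) of one dataset record
# emitted by the Censys main() above. The Link here is richer than the SonarSSL one
# (name, code, href, size, type, hash), and later stages such as generate_workset()
# look up dataset['files']['zgrab-results.json.lz4']['href'].
example_censys_dataset = {
    'id': 10,
    'date': '2017-01-01 12:00:00',      # text of the h2 element preceding the table
    'date_utc': 1483272000,             # utils.unix_time(datetime.strptime(header, '%Y-%m-%d %H:%M:%S'))
    'files': {
        'zgrab-results.json.lz4': {},   # Link(...).to_json(); 'href' is used for downloads
    },
}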
def main(self):
    """
    Processing censys sonar.ssl
    Recodes one big certificate file to smaller _certs.gz files as published since 2015
    so we can process it in the same way.
    https://scans.io/study/sonar.ssl
    :return:
    """
    parser = argparse.ArgumentParser(
        description='Recoding big sonarssl file to the incremental one')

    parser.add_argument('--url', dest='url', nargs=argparse.ZERO_OR_MORE, default=[],
                        help='censys links')

    parser.add_argument('--json', dest='json', default=None,
                        help='sonar links json')

    parser.add_argument('--datadir', dest='datadir', default='.',
                        help='datadir')

    parser.add_argument('--fprint-only', dest='fprint_only', default=False, action='store_const', const=True,
                        help='Only fprint gen')

    parser.add_argument('--base-only', dest='base_only', default=False, action='store_const', const=True,
                        help='Chunk only one big dataset sample')

    parser.add_argument('file', nargs=argparse.ZERO_OR_MORE, default=[],
                        help='censys link file')

    args = parser.parse_args()

    # Big in memory hash table fprint -> certificate
    bigdb = {}
    testrng = range(10, 93) if args.base_only else range(10, 181)

    # fprints seen
    fprints_seen_set = set()
    fprints_previous = set()

    if not args.fprint_only:
        if len(args.file) == 0:
            return

        main_file = args.file[0]
        self.load_cert_db(main_file, bigdb)

    jsdb = None
    with open(args.json, 'r') as fh:
        jsdb = json.load(fh)

    jsdb_ids = {x['id']: x for x in jsdb['data']}
    for test_idx in testrng:
        files = jsdb_ids[test_idx]['files']
        filerec = None
        for tmprec in files:
            if '_hosts.gz' in tmprec:
                filerec = files[tmprec]
                break

        fname = filerec['name']
        flink = filerec['href']

        # 20131104/20131104_hosts.gz
        fname_2 = fname.split('/')
        if len(fname_2) == 2:
            fname_2 = fname_2[1]
        else:
            fname_2 = fname_2[0]

        dateparts = fname_2.split('_')
        datepart = dateparts[0]

        hostfile = os.path.join(args.datadir, '%s_hosts.gz' % datepart)
        certfile = os.path.join(args.datadir, '%s_certs.gz' % datepart)
        fprintfile = os.path.join(args.datadir, '%s_fprints.csv' % datepart)
        fprintfile_new = os.path.join(args.datadir, '%s_fprints_new.csv' % datepart)
        fprintfile_new_p = os.path.join(args.datadir, '%s_fprints_new_p.csv' % datepart)
        fprintfile_lost_p = os.path.join(args.datadir, '%s_fprints_lost_p.csv' % datepart)
        fprintfile_same = os.path.join(args.datadir, '%s_fprints_same.csv' % datepart)

        logger.info('Processing test idx %s, file %s, newfile: %s' % (test_idx, fname, certfile))

        not_found = 0
        fprints_set = set()
        fprints_set_new = set()

        iobj = None
        hosth = None
        if os.path.exists(hostfile):
            iobj = input_obj.FileInputObject(fname=hostfile)
        elif args.fprint_only:
            continue
        else:
            hosth = open(hostfile, 'wb')
            iobj = input_obj.ReconnectingLinkInputObject(url=flink, rec=files)
            iobj = input_obj.TeeInputObject(parent_fh=iobj, copy_fh=hosth, close_copy_on_exit=True)

        # Reading host file, ip -> fprints associations
        with iobj:
            fh = gzipinputstream.GzipInputStream(fileobj=iobj)
            for rec_idx, rec in enumerate(fh):
                try:
                    linerec = rec.strip().split(',')
                    ip = linerec[0].strip()
                    fprints = linerec[1:]
                    for fprint in fprints:
                        fprint = utils.strip_hex_prefix(fprint.strip()).lower()
                        fprints_set.add(fprint)

                    if rec_idx % 1000000 == 0:
                        iobj.flush()
                        logger.debug(' .. progress %s, ip %s, mem: %s MB'
                                     % (rec_idx, ip, utils.get_mem_usage() / 1024.0))

                except Exception as e:
                    logger.error('Exception in processing rec %s: %s' % (rec_idx, e))
                    logger.debug(rec)
                    logger.debug(traceback.format_exc())

        fprints_len = len(fprints_set)
        logger.info('File processed, fprint db size: %d. Mem: %s MB' % (fprints_len, utils.get_mem_mb()))
        # Only fingerprints
        logger.info('Going to sort fprints...')
        fprints = list(fprints_set)
        fprints.sort()
        logger.info('fprints sorted. Storing fingerprints. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))

        # Store only new fingerprints, not seen before
        logger.info('Storing new fingerprints. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))
        with open(fprintfile_new, 'w') as outfh:
            for fprint in fprints:
                if fprint not in fprints_seen_set:
                    outfh.write('%s\n' % fprint)
                    fprints_set_new.add(fprint)
                    fprints_seen_set.add(fprint)

        # Certificates new from previous
        logger.info('Storing new fingerprints from previous. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))
        with open(fprintfile_new_p, 'w') as outfh:
            for fprint in fprints:
                if fprint not in fprints_previous:
                    outfh.write('%s\n' % fprint)

        # Certificates removed from previous
        logger.info('Storing lost fingerprints from previous. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))
        fprints_previous_list = list(fprints_previous)
        fprints_previous_list.sort()
        with open(fprintfile_lost_p, 'w') as outfh:
            for fprint in fprints_previous_list:
                if fprint not in fprints_set:
                    outfh.write('%s\n' % fprint)

        # Certificates same as in the previous dataset
        logger.info('Storing same fingerprints as previous. Mem: %s MB' % (utils.get_mem_usage() / 1024.0))
        with open(fprintfile_same, 'w') as outfh:
            for fprint in fprints:
                if fprint in fprints_previous:
                    outfh.write('%s\n' % fprint)

        # Store only fingerprints contained in this set.
        with open(fprintfile, 'w') as outfh:
            for fprint in fprints:
                outfh.write('%s\n' % fprint)

        if args.fprint_only:
            fprints_previous = set(fprints_set)
            continue

        # Certificates file _certs.gz - only new certificates
        fprints_new = list(fprints_set_new)
        fprints_new.sort()

        fprints_len = len(fprints_new)
        fprints_progress_unit = fprints_len / 100
        fprints_progress_last = 0
        logger.info('Dumping only new certificates, fprint db size: %d' % fprints_len)

        with gzip.open(certfile, 'wb') as outfh:
            for rec_idx, fprint in enumerate(fprints_new):
                if fprints_progress_last + fprints_progress_unit < rec_idx:
                    fprints_progress_last = rec_idx
                    outfh.flush()
                    logger.debug(' .. progress %s, mem: %s MB' % (rec_idx, utils.get_mem_usage() / 1024.0))

                if fprint in bigdb:
                    outfh.write('%s,%s\n' % (fprint, base64.b64encode(bigdb[fprint])))
                else:
                    not_found += 1

        logger.info('Finished with idx %s, file %s, newfile: %s, not found: %s, mem: %s MB'
                    % (test_idx, fname, certfile, not_found, utils.get_mem_usage() / 1024.0))

        # Final step - store to previous
        fprints_previous = set(fprints_set)
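# A small standalone sketch (assumed helper name, not part of the original code) of
# the set relations the loop above materializes into the per-snapshot CSV files:
# "new" fingerprints were never seen in any earlier snapshot, "new_p"/"lost_p" are
# the additions/removals against the immediately preceding snapshot, and "same" is
# the overlap with it.
def fingerprint_diffs(current, previous, seen_so_far):
    return {
        'new': current.difference(seen_so_far),       # -> _fprints_new.csv
        'new_p': current.difference(previous),        # -> _fprints_new_p.csv
        'lost_p': previous.difference(current),       # -> _fprints_lost_p.csv
        'same': current.intersection(previous),       # -> _fprints_same.csv
    }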
def work(self):
    """
    Processing
    :return:
    """
    # Open the json link file
    args = self.args
    index_db = None
    with open(args.json, 'r') as fh:
        index_db = json.load(fh)

    self.index_db = index_db

    # Manage the raw_certificates file
    main_cert_rec = index_db['data'][1]
    main_cert_link = main_cert_rec['fhref']
    main_cert_file = os.path.join(args.datadir, os.path.basename(main_cert_link))

    json_res_file = os.path.join(args.datadir, 'eco.json')
    json_res_fh = open(json_res_file, 'w')

    iobj = None
    if os.path.exists(main_cert_file):
        iobj = input_obj.FileInputObject(fname=main_cert_file)
    elif os.path.exists(main_cert_file[:-3]):  # ungzipped
        main_cert_file = main_cert_file[:-3]
        iobj = input_obj.FileInputObject(fname=main_cert_file)
    else:
        logger.info('Going to download certificate file')
        hosth = open(main_cert_file, 'wb')
        iobj = input_obj.ReconnectingLinkInputObject(url=main_cert_link, rec=main_cert_rec)
        iobj = input_obj.TeeInputObject(parent_fh=iobj, copy_fh=hosth, close_copy_on_exit=True)

    # Big in memory hash table fprint -> certificate
    bigdb = {}
    counter = 0
    testrng = range(15, 171)

    # fprints seen
    fprints_seen_set = set()
    fprints_previous = set()

    # Process the main certificate file
    with iobj:
        fh = iobj
        if main_cert_file.endswith('.gz'):
            fh = gzipinputstream.GzipInputStream(fileobj=iobj)
        else:
            fh = NewlineIterator(iobj)

        for idx, line in enumerate(fh):
            try:
                csv = line.split(',')
                fprint = utils.strip_hex_prefix(csv[0].strip())
                cert = utils.strip_hex_prefix(csv[1].strip())

                certbin = base64.b16decode(cert, True)
                bigdb[fprint] = certbin
                counter += 1

                if counter % 100000 == 0:
                    logger.debug(' .. progress %s, fprint %s, memory: %s MB'
                                 % (counter, fprint, utils.get_mem_mb()))

            except Exception as e:
                logger.error('Error in processing %s' % e)
                logger.debug(traceback.format_exc())

    logger.info('Uff... big DB loaded, num entries: %s' % len(bigdb))

    # Load sequential scans
    # Host file association ip -> fingerprint
    jsdb_ids = {x['id']: x for x in self.index_db['data']}
    last_file_date_utc = 0
    for test_idx in testrng:
        filerec = jsdb_ids[test_idx]
        fname = filerec['fname']
        flink = filerec['fhref']
        datepart = filerec['date']
        date_utc = filerec['date_utc']
        fname_2 = os.path.basename(fname)

        # As the dataset is a series of snapshots, we can skip some time intervals.
        if self.args.space and date_utc - last_file_date_utc < (60*60*24*7 - 60*60):
            logger.info('Skipping record %d, as the time diff is too small from the previous one: %s'
                        % (test_idx, date_utc - last_file_date_utc))
            continue

        last_file_date_utc = date_utc
        hostfile = os.path.join(args.datadir, fname_2)
        certfile = os.path.join(args.datadir, '%s_certs.gz' % datepart)
        fprintfile_new = os.path.join(args.datadir, '%s_fprints_new.csv' % datepart)
        fprintfile_new_p = os.path.join(args.datadir, '%s_fprints_new_p.csv' % datepart)
        fprintfile_lost_p = os.path.join(args.datadir, '%s_fprints_lost_p.csv' % datepart)

        js_res_rec = collections.OrderedDict()
        js_res_rec['fname'] = fname
        js_res_rec['fhref'] = flink
        js_res_rec['date'] = datepart
        js_res_rec['date_utc'] = date_utc
        js_res_rec['hostfile'] = hostfile
        js_res_rec['certfile'] = certfile
        js_res_rec['fprintfile_new'] = fprintfile_new
        js_res_rec['rec'] = filerec

        logger.info('Processing test idx %s, file %s' % (test_idx, fname))

        not_found = 0
        fprints_set = set()
        fprints_set_new = set()

        iobj = None
        hosth = None

        # Open local or open remote (with download to local)
        if os.path.exists(hostfile):
            iobj = input_obj.FileInputObject(fname=hostfile)
        else:
            hosth = open(hostfile, 'wb')
            iobj = input_obj.ReconnectingLinkInputObject(url=flink, rec=filerec)
            iobj = input_obj.TeeInputObject(parent_fh=iobj, copy_fh=hosth, close_copy_on_exit=True)

        # Processing ip -> fingerprints associations
        with iobj:
            fh = gzipinputstream.GzipInputStream(fileobj=iobj)
            for rec_idx, rec in enumerate(fh):
                try:
                    linerec = rec.strip().split(',')
                    ip = linerec[0].strip()
                    fprint = utils.strip_hex_prefix(linerec[2].strip())
                    fprints_set.add(fprint)

                    if rec_idx % 1000000 == 0:
                        iobj.flush()
                        logger.debug(' .. progress %s, ip %s, mem: %s MB'
                                     % (rec_idx, ip, utils.get_mem_mb()))

                except Exception as e:
                    logger.error('Exception in processing rec %s: %s' % (rec_idx, e))
                    logger.debug(rec)
                    logger.debug(traceback.format_exc())

        fprints_len = len(fprints_set)
        logger.info('File processed, fprint db size: %d' % fprints_len)

        # Only fingerprints
        logger.info('Going to sort fprints...')
        fprints = list(fprints_set)
        fprints.sort()
        logger.info('fprints sorted. Storing fingerprints. Mem: %s MB' % (utils.get_mem_mb()))

        # Store only new fingerprints, not seen before
        logger.info('Storing new fingerprints. Mem: %s MB' % (utils.get_mem_mb()))
        with open(fprintfile_new, 'w') as outfh:
            for fprint in fprints:
                if fprint not in fprints_seen_set:
                    outfh.write('%s\n' % fprint)
                    fprints_set_new.add(fprint)
                    fprints_seen_set.add(fprint)

        # Certificates new from previous
        logger.info('Storing new fingerprints from previous. Mem: %s MB' % (utils.get_mem_mb()))
        with open(fprintfile_new_p, 'w') as outfh:
            for fprint in fprints:
                if fprint not in fprints_previous:
                    outfh.write('%s\n' % fprint)

        # Certificates removed from previous
        logger.info('Storing lost fingerprints from previous. Mem: %s MB' % (utils.get_mem_mb()))
        fprints_previous_list = list(fprints_previous)
        fprints_previous_list.sort()
        with open(fprintfile_lost_p, 'w') as outfh:
            for fprint in fprints_previous_list:
                if fprint not in fprints_set:
                    outfh.write('%s\n' % fprint)

        # Certificates file _certs.gz - only new certificates, incremental records
        fprints_new = list(fprints_set_new)
        fprints_new.sort()

        fprints_len = len(fprints_new)
        fprints_progress_unit = fprints_len / 100
        fprints_progress_last = 0
        logger.info('Dumping only new certificates, fprint db size: %d' % fprints_len)

        with gzip.open(certfile, 'wb') as outfh:
            for rec_idx, fprint in enumerate(fprints_new):
                if fprints_progress_last + fprints_progress_unit < rec_idx:
                    fprints_progress_last = rec_idx
                    outfh.flush()
                    logger.debug(' .. progress %s, mem: %s MB' % (rec_idx, utils.get_mem_mb()))

                if fprint in bigdb:
                    outfh.write('%s,%s\n' % (fprint, base64.b64encode(bigdb[fprint])))
                else:
                    not_found += 1

        logger.info('Finished with idx %s, file %s, newfile: %s, not found: %s, mem: %s MB'
                    % (test_idx, fname, certfile, not_found, utils.get_mem_mb()))

        # Final step - store to previous
        fprints_previous = set(fprints_set)

        # Result file record flush
        json_res_fh.write(json.dumps(js_res_rec) + '\n')
        json_res_fh.flush()

    json_res_fh.close()
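# A minimal reader sketch (assumption: each js_res_rec is written above as one JSON
# object per line of eco.json), useful for loading the per-snapshot result records
# back for inspection; the function name is illustrative.
import json

def load_eco_records(path):
    records = []
    with open(path, 'r') as fh:
        for line in fh:
            line = line.strip()
            if line:
                records.append(json.loads(line))
    return records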