Example #1
    def report(self):
        """
        pass
        :return: 
        """
        ct = time.time()
        if ct - self.state_last_dump < self.state_time_dump:
            return

        logger.debug(
            '.. rsa: %s, non-rsa: %s, errs: %s, nofpr: %s, nor: %s, exp: %s, found: %s, not CA:'
            ' %s, mem: %s MB, depth: %s, cfile: %s' %
            (self.num_rsa, self.num_non_rsa, self.num_errs,
             self.num_no_fprint_raw, self.num_no_raw,
             self.num_expired, self.num_found, self.num_not_ca,
             utils.get_mem_mb(), self.cur_depth, self.cur_file))
        self.state_last_dump = ct
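The report() method above is a simple rate limiter for log output: it drops the dump unless enough time has passed since the last one. A minimal standalone sketch of the same pattern, assuming only the standard library (the Reporter class and its constructor argument are hypothetical, mirroring the field names above):

import logging
import time

logger = logging.getLogger(__name__)


class Reporter(object):
    """Minimal sketch of the rate-limited reporting pattern used above."""

    def __init__(self, interval=10.0):
        self.state_time_dump = interval  # minimum seconds between dumps
        self.state_last_dump = 0.0       # timestamp of the last dump

    def report(self, **stats):
        ct = time.time()
        if ct - self.state_last_dump < self.state_time_dump:
            return  # too soon since the last dump, skip
        logger.debug('.. progress: %s', stats)
        self.state_last_dump = ct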
Example #2
    def load_cert_db(self, main_file, bigdb):
        """
        Loads big fprint -> certificate database to memory
        :param main_file: 
        :param bigdb: 
        :return: 
        """
        counter = 0
        # Open the main file, gzipped or not
        if main_file.endswith('gz'):
            fh = gzip.open(main_file, 'rb')
        else:
            fh = open(main_file, 'rb')

        errors = 0
        with fh:
            for idx, line in enumerate(fh):
                try:
                    fprint, cert = line.split(',', 1)
                    cert = cert.strip()
                    fprint = utils.strip_hex_prefix(fprint.strip()).lower()

                    certbin = base64.b64decode(cert)
                    bigdb[fprint] = certbin
                    counter += 1

                    if counter % 10000 == 0:
                        logger.debug(
                            ' .. progress %s, fprint %s, memory: %s MB' %
                            (counter, fprint, utils.get_mem_usage() / 1024.0))

                except Exception as e:
                    errors += 1
                    logger.error('Error in processing %s' % e)
                    self.trace_logger.log(e)

        logger.info(
            'Uff... big DB loaded, num entries: %s, errors: %s, memory: %s MB'
            % (len(bigdb), errors, utils.get_mem_mb()))
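The loader expects one fprint,base64cert pair per line, with the fingerprint lower-cased and any hex prefix stripped. A small self-contained round trip under that assumption (the sample file name is made up, and the inline prefix stripping is a stand-in for utils.strip_hex_prefix):

import base64
import gzip

# Write a tiny sample file in the expected format: "fprint,base64cert" per line.
with gzip.open('certs_sample.csv.gz', 'wb') as fh:
    fh.write(b'0xAABBCC,' + base64.b64encode(b'fake-der-bytes') + b'\n')

# Read it back with the same parsing rules as load_cert_db.
bigdb = {}
with gzip.open('certs_sample.csv.gz', 'rb') as fh:
    for line in fh:
        fprint, cert = line.split(b',', 1)
        fprint = fprint.strip().lower()
        if fprint.startswith(b'0x'):
            fprint = fprint[2:]  # same effect as utils.strip_hex_prefix
        bigdb[fprint.decode()] = base64.b64decode(cert.strip())

assert bigdb['aabbcc'] == b'fake-der-bytes'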
Example #3
    def process_iobj(self, iobj):
        """
        Processing
        :param iobj: 
        :return: 
        """
        input_name = self.iobj_name(iobj)
        logger.info('Processing: %s' % input_name)

        finish_file = self.get_finish_file(input_name)
        if os.path.exists(finish_file):
            logger.info('Finish indicator file exists, skipping: %s' %
                        finish_file)
            return

        self.cur_decompressor = None
        self.cur_state_file = self.get_state_file(input_name)
        file_leafs = self.get_classification_leafs(input_name)
        file_roots = self.get_classification_roots(input_name)
        self.last_record_resumed = None

        self.processor = newline_reader.NewlineReader(is_json=False)
        handle = iobj
        name = str(iobj)

        if name.endswith('lz4'):
            self.cur_decompressor = lz4framed.Decompressor(handle)
            handle = self.cur_decompressor

        if not self.is_dry() and (not self.args.continue1
                                  or not os.path.exists(file_leafs)
                                  or not os.path.exists(file_roots)):
            utils.safely_remove(file_leafs)
            utils.safely_remove(file_roots)
            self.file_leafs_fh = utils.safe_open(file_leafs,
                                                 mode='w',
                                                 chmod=0o644)
            self.file_roots_fh = utils.safe_open(file_roots,
                                                 mode='w',
                                                 chmod=0o644)

        elif self.args.continue1:
            logger.info('Continuing with the started files')
            self.file_leafs_fh = open(file_leafs,
                                      mode='r+' if not self.is_dry() else 'r')
            self.file_roots_fh = open(file_roots,
                                      mode='r+' if not self.is_dry() else 'r')
            self.restore_checkpoint(iobj)
            self.continue_roots()
            self.continue_leafs(file_leafs)

        with iobj:
            resume_token_found = False
            resume_token = None
            resume_idx = 0
            record_ctr = -1
            already_processed = 0
            read_start = self.read_data
            for idx, record in self.processor.process(handle):
                try:
                    record_ctr += 1
                    self.read_data += len(record)

                    # Check the checkpoint distance + boundary - process all newline chunks available
                    if self.read_data - self.last_report >= 1024 * 1024 * 1024 and self.processor.step_cur_last_element:
                        logger.info(
                            '...progress: %s GB, idx: %s, pos: %s GB, mem: %04.8f MB, readpos: %s (%4.6f GB)'
                            % (self.read_data / 1024.0 / 1024.0 / 1024.0, idx,
                               self.read_data, utils.get_mem_mb(), iobj.tell(),
                               iobj.tell() / 1024.0 / 1024.0 / 1024.0))

                        self.last_report = self.read_data
                        self.try_store_checkpoint(iobj=iobj,
                                                  idx=idx,
                                                  resume_idx=resume_idx,
                                                  resume_token=resume_token)

                        # Flush the already-seen IP database once we are far
                        # enough (2 GB) past the resumed checkpoint; it is no
                        # longer needed for duplicate detection.
                        if self.read_data - read_start > 1024 * 1024 * 1024 * 2:
                            self.state_loaded_ips = set()

                    js = json.loads(record)

                    # Skip duplicate records already processed before the resumed checkpoint was stored
                    if js['ip'] in self.state_loaded_ips:
                        already_processed += 1
                        continue

                    self.process_record(idx, js)

                except Exception as e:
                    logger.error('Exception in processing %d: %s' %
                                 (self.ctr, e))
                    logger.debug(traceback.format_exc())
                    logger.debug(record)

                self.ctr += 1

            logger.info('Total: %d' % self.ctr)
            logger.info('Total_chain: %d' % self.chain_ctr)
            logger.info('Not tls: %d' % self.not_tls)
            logger.info('Not cert ok: %d' % self.not_cert_ok)
            logger.info('Not chain ok: %d' % self.not_chain_ok)
            logger.info('Not parsed: %d' % self.not_parsed)
            logger.info('Not rsa: %d' % self.not_rsa)

        logger.info('Processed: %s' % iobj)
        if not self.is_dry():
            self.file_leafs_fh.close()
            self.file_roots_fh.close()
            utils.try_touch(finish_file)
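try_store_checkpoint and restore_checkpoint are not shown in this example. A minimal sketch of the underlying idea, with hypothetical names and a JSON state file: persist the byte offset of the last fully processed chunk, and on restart seek back to it before re-reading.

import json
import os


def store_checkpoint(state_file, offset, idx):
    # Write to a temp file first so a crash never leaves a truncated state file.
    tmp = state_file + '.tmp'
    with open(tmp, 'w') as fh:
        json.dump({'offset': offset, 'idx': idx}, fh)
    os.rename(tmp, state_file)  # atomic replace on POSIX


def restore_checkpoint(state_file, fh):
    # Returns the record index to resume from, seeking fh to the saved offset.
    if not os.path.exists(state_file):
        return 0
    with open(state_file) as cf:
        state = json.load(cf)
    fh.seek(state['offset'])
    return state['idx']

This is also why the code above keeps state_loaded_ips around after a resume: records between the checkpoint offset and the actual stop point may be seen twice, and the set filters out those duplicates.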
Example #4
    def main(self):
        """
        Processing censys sonar.ssl
        Recodes one big certificate file to smaller _certs.gz files as published since 2015
        so we can process it in the same way.
        
        https://scans.io/study/sonar.ssl
        :return:
        """
        parser = argparse.ArgumentParser(
            description='Recoding big sonarssl file to the incremental one')

        parser.add_argument('--url',
                            dest='url',
                            nargs=argparse.ZERO_OR_MORE,
                            default=[],
                            help='censys links')

        parser.add_argument('--json',
                            dest='json',
                            default=None,
                            help='sonar links json')

        parser.add_argument('--datadir',
                            dest='datadir',
                            default='.',
                            help='datadir')

        parser.add_argument('--fprint-only',
                            dest='fprint_only',
                            default=False,
                            action='store_const',
                            const=True,
                            help='Only fprint gen')

        parser.add_argument('--base-only',
                            dest='base_only',
                            default=False,
                            action='store_const',
                            const=True,
                            help='Chunk only one big dataset sample')

        parser.add_argument('file',
                            nargs=argparse.ZERO_OR_MORE,
                            default=[],
                            help='censys link file')

        args = parser.parse_args()

        # Big in-memory hash table fprint -> certificate
        bigdb = {}
        testrng = range(10, 93) if args.base_only else range(10, 181)

        # fprints seen
        fprints_seen_set = set()
        fprints_previous = set()

        if not args.fprint_only:
            if len(args.file) == 0:
                return
            main_file = args.file[0]
            self.load_cert_db(main_file, bigdb)

        with open(args.json, 'r') as fh:
            jsdb = json.load(fh)

        jsdb_ids = {x['id']: x for x in jsdb['data']}
        for test_idx in testrng:
            files = jsdb_ids[test_idx]['files']
            filerec = None
            for tmprec in files:
                if '_hosts.gz' in tmprec:
                    filerec = files[tmprec]
                    break

            fname = filerec['name']
            flink = filerec['href']

            # 20131104/20131104_hosts.gz
            fname_2 = fname.split('/')
            if len(fname_2) == 2:
                fname_2 = fname_2[1]
            else:
                fname_2 = fname_2[0]

            dateparts = fname_2.split('_')
            datepart = dateparts[0]

            hostfile = os.path.join(args.datadir, '%s_hosts.gz' % datepart)
            certfile = os.path.join(args.datadir, '%s_certs.gz' % datepart)
            fprintfile = os.path.join(args.datadir,
                                      '%s_fprints.csv' % datepart)
            fprintfile_new = os.path.join(args.datadir,
                                          '%s_fprints_new.csv' % datepart)
            fprintfile_new_p = os.path.join(args.datadir,
                                            '%s_fprints_new_p.csv' % datepart)
            fprintfile_lost_p = os.path.join(
                args.datadir, '%s_fprints_lost_p.csv' % datepart)
            fprintfile_same = os.path.join(args.datadir,
                                           '%s_fprints_same.csv' % datepart)
            logger.info('Processing test idx %s, file %s, newfile: %s' %
                        (test_idx, fname, certfile))

            not_found = 0
            fprints_set = set()
            fprints_set_new = set()
            iobj = None
            hosth = None

            if os.path.exists(hostfile):
                iobj = input_obj.FileInputObject(fname=hostfile)

            elif args.fprint_only:
                continue

            else:
                hosth = open(hostfile, 'wb')
                iobj = input_obj.ReconnectingLinkInputObject(url=flink,
                                                             rec=files)
                iobj = input_obj.TeeInputObject(parent_fh=iobj,
                                                copy_fh=hosth,
                                                close_copy_on_exit=True)

            # Reading host file, ip -> fprints associations
            with iobj:
                fh = gzipinputstream.GzipInputStream(fileobj=iobj)
                for rec_idx, rec in enumerate(fh):
                    try:

                        linerec = rec.strip().split(',')
                        ip = linerec[0].strip()
                        fprints = linerec[1:]
                        for fprint in fprints:
                            fprint = utils.strip_hex_prefix(
                                fprint.strip()).lower()
                            fprints_set.add(fprint)

                        if rec_idx % 1000000 == 0:
                            iobj.flush()
                            logger.debug(
                                ' .. progress %s, ip %s, mem: %s MB' %
                                (rec_idx, ip, utils.get_mem_usage() / 1024.0))

                    except Exception as e:
                        logger.error('Exception in processing rec %s: %s' %
                                     (rec_idx, e))
                        logger.debug(rec)
                        logger.debug(traceback.format_exc())

            fprints_len = len(fprints_set)
            logger.info('File processed, fprint db size: %d. Mem: %s MB' %
                        (fprints_len, utils.get_mem_mb()))

            # Only fingerprints
            logger.info('Going to sort fprints...')
            fprints = list(fprints_set)
            fprints.sort()
            logger.info('fprints sorted. Storing fingerprints. Mem: %s MB' %
                        (utils.get_mem_usage() / 1024.0))

            # Store only new fingerprints, not seen before
            logger.info('Storing new fingerprints. Mem: %s MB' %
                        (utils.get_mem_usage() / 1024.0))
            with open(fprintfile_new, 'w') as outfh:
                for fprint in fprints:
                    if fprint not in fprints_seen_set:
                        outfh.write('%s\n' % fprint)
                        fprints_set_new.add(fprint)
                        fprints_seen_set.add(fprint)

            # Certificates new from previous
            logger.info('Storing new fingerprints from previous. Mem: %s MB' %
                        (utils.get_mem_usage() / 1024.0))
            with open(fprintfile_new_p, 'w') as outfh:
                for fprint in fprints:
                    if fprint not in fprints_previous:
                        outfh.write('%s\n' % fprint)

            # Certificates removed from previous
            logger.info('Storing lost fingerprints from previous. Mem: %s MB' %
                        (utils.get_mem_usage() / 1024.0))
            fprints_previous_list = list(fprints_previous)
            fprints_previous_list.sort()

            with open(fprintfile_lost_p, 'w') as outfh:
                for fprint in fprints_previous_list:
                    if fprint not in fprints_set:
                        outfh.write('%s\n' % fprint)

            # Certificates same as in the previous dataset
            logger.info('Storing same fingerprints as previous. Mem: %s MB' %
                        (utils.get_mem_usage() / 1024.0))
            with open(fprintfile_same, 'w') as outfh:
                for fprint in fprints:
                    if fprint in fprints_previous:
                        outfh.write('%s\n' % fprint)

            # Store only fingerprints contained in this set.
            with open(fprintfile, 'w') as outfh:
                for fprint in fprints:
                    outfh.write('%s\n' % fprint)

            if args.fprint_only:
                fprints_previous = set(fprints_set)
                continue

            # Certificates file _certs.gz - only new certificates
            fprints_new = list(fprints_set_new)
            fprints_new.sort()

            fprints_len = len(fprints_new)
            fprints_progress_unit = fprints_len / 100
            fprints_progress_last = 0
            logger.info('Dumping only new certificates, fprint db size: %d' %
                        fprints_len)

            with gzip.open(certfile, 'wb') as outfh:
                for rec_idx, fprint in enumerate(fprints_new):

                    if fprints_progress_last + fprints_progress_unit < rec_idx:
                        fprints_progress_last = rec_idx
                        outfh.flush()
                        logger.debug(' .. progress %s, mem: %s MB' %
                                     (rec_idx, utils.get_mem_usage() / 1024.0))

                    if fprint in bigdb:
                        outfh.write('%s,%s\n' %
                                    (fprint, base64.b64encode(bigdb[fprint])))

                    else:
                        not_found += 1

            logger.info(
                'Finished with idx %s, file %s, newfile: %s, not found: %s, mem: %s MB'
                % (test_idx, fname, certfile, not_found,
                   utils.get_mem_usage() / 1024.0))

            # Final step - store to previous
            fprints_previous = set(fprints_set)
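The per-snapshot fingerprint files written above are plain set operations between the current and the previous snapshot. An equivalent standalone formulation (the fingerprint values are hypothetical placeholders):

# Fingerprints seen in the previous and the current snapshot.
fprints_previous = {'aa11', 'bb22', 'cc33'}
fprints_set = {'bb22', 'cc33', 'dd44'}

new_from_previous = sorted(fprints_set - fprints_previous)   # ['dd44']
lost_from_previous = sorted(fprints_previous - fprints_set)  # ['aa11']
same_as_previous = sorted(fprints_set & fprints_previous)    # ['bb22', 'cc33']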
Example #5
def main():
    # Parse command line arguments
    parser = argparse.ArgumentParser(description='IP addr diff')

    parser.add_argument('--datadir',
                        dest='datadir',
                        default='.',
                        help='datadir')

    parser.add_argument('--test', dest='test', default='', help='test name')

    parser.add_argument('files',
                        nargs=argparse.ZERO_OR_MORE,
                        default=[],
                        help='files')

    args = parser.parse_args()

    ip_first = set()
    ip_second = set()

    if len(args.files) < 2:
        logger.error('Requires at least two input files')
        return

    logger.info('Loading first file...')
    load_set(args.files[0], ip_first)
    logger.info('File loaded, #of ip addresses: %s, mem: %s MB' %
                (len(ip_first), utils.get_mem_mb()))

    logger.info('Loading second file...')
    load_set(args.files[1], ip_second)
    logger.info('File loaded, #of ip addresses: %s, mem: %s MB' %
                (len(ip_second), utils.get_mem_mb()))

    path_ab = os.path.join(args.datadir, '%s_a_min_b.csv' % args.test)
    path_ba = os.path.join(args.datadir, '%s_b_min_a.csv' % args.test)
    path_sym = os.path.join(args.datadir, '%s_a_sym_b.csv' % args.test)
    path_int = os.path.join(args.datadir,
                            '%s_a_intersection_b.csv' % args.test)
    path_uni = os.path.join(args.datadir, '%s_a_union_b.csv' % args.test)

    logger.info('Dumping a - b')
    with open(path_ab, 'w') as fh:
        res_set = sorted(ip_first - ip_second)
        for x in res_set:
            fh.write('%s\n' % x)

    logger.info('Dumping b - a')
    with open(path_ba, 'w') as fh:
        res_set = sorted(ip_second - ip_first)
        for x in res_set:
            fh.write('%s\n' % x)

    logger.info('Dumping a ^ b')
    with open(path_sym, 'w') as fh:
        res_set = sorted(ip_first ^ ip_second)
        for x in res_set:
            fh.write('%s\n' % x)

    logger.info('Dumping a & b')
    with open(path_int, 'w') as fh:
        res_set = sorted(ip_first & ip_second)
        for x in res_set:
            fh.write('%s\n' % x)

    logger.info('Dumping a | b')
    with open(path_uni, 'w') as fh:
        res_set = sorted(ip_first | ip_second)
        for x in res_set:
            fh.write('%s\n' % x)
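load_set is not defined in this example. A plausible implementation, assuming one IP address per line and optional gzip compression (both assumptions, not confirmed by the source):

import gzip


def load_set(fname, dst):
    # Fill dst (a set) with one stripped line per entry.
    opener = gzip.open if fname.endswith('.gz') else open
    with opener(fname, 'rb') as fh:
        for line in fh:
            line = line.strip()
            if line:
                dst.add(line.decode())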
Example #6
    def work(self):
        """
        Processing
        :return: 
        """

        # Open the json link file
        args = self.args
        with open(args.json, 'r') as fh:
            index_db = json.load(fh)
        self.index_db = index_db

        # Manage the raw_certificates file
        main_cert_rec = index_db['data'][1]
        main_cert_link = main_cert_rec['fhref']
        main_cert_file = os.path.join(args.datadir, os.path.basename(main_cert_link))
        json_res_file = os.path.join(args.datadir, 'eco.json')
        json_res_fh = open(json_res_file, 'w')

        iobj = None
        if os.path.exists(main_cert_file):
            iobj = input_obj.FileInputObject(fname=main_cert_file)

        elif os.path.exists(main_cert_file[:-3]):  # ungzipped
            main_cert_file = main_cert_file[:-3]
            iobj = input_obj.FileInputObject(fname=main_cert_file)

        else:
            logger.info('Going to download certificate file')
            hosth = open(main_cert_file, 'wb')
            iobj = input_obj.ReconnectingLinkInputObject(url=main_cert_link, rec=main_cert_rec)
            iobj = input_obj.TeeInputObject(parent_fh=iobj, copy_fh=hosth, close_copy_on_exit=True)

        # Big in-memory hash table fprint -> certificate
        bigdb = {}
        counter = 0
        testrng = range(15, 171)

        # fprints seen
        fprints_seen_set = set()
        fprints_previous = set()

        # Process the main certificate file
        with iobj:
            if main_cert_file.endswith('.gz'):
                fh = gzipinputstream.GzipInputStream(fileobj=iobj)
            else:
                fh = NewlineIterator(iobj)

            for idx, line in enumerate(fh):
                try:
                    csv = line.split(',')
                    fprint = utils.strip_hex_prefix(csv[0].strip())
                    cert = utils.strip_hex_prefix(csv[1].strip())

                    certbin = base64.b16decode(cert, True)
                    bigdb[fprint] = certbin
                    counter += 1

                    if counter % 100000 == 0:
                        logger.debug(' .. progress %s, fprint %s, memory: %s MB'
                                     % (counter, fprint, utils.get_mem_mb()))

                except Exception as e:
                    logger.error('Error in processing %s' % e)
                    logger.debug(traceback.format_exc())

        logger.info('Uff... big DB loaded, num entries: %s' % len(bigdb))

        # Load sequential scans
        # Host file association ip -> fingerprint
        jsdb_ids = {x['id']: x for x in self.index_db['data']}
        last_file_date_utc = 0
        for test_idx in testrng:
            filerec = jsdb_ids[test_idx]
            fname = filerec['fname']
            flink = filerec['fhref']
            datepart = filerec['date']
            date_utc = filerec['date_utc']
            fname_2 = os.path.basename(fname)

            # As the dataset is a series of snapshots, records closer than
            # roughly one week (minus an hour of slack) to the previous one
            # can be skipped.
            if self.args.space and date_utc - last_file_date_utc < (60*60*24*7 - 60*60):
                logger.info('Skipping record %d, as the time diff from the previous one is too small: %s'
                            % (test_idx, date_utc - last_file_date_utc))
                continue

            last_file_date_utc = date_utc
            hostfile = os.path.join(args.datadir, fname_2)
            certfile = os.path.join(args.datadir, '%s_certs.gz' % datepart)
            fprintfile_new = os.path.join(args.datadir, '%s_fprints_new.csv' % datepart)
            fprintfile_new_p = os.path.join(args.datadir, '%s_fprints_new_p.csv' % datepart)
            fprintfile_lost_p = os.path.join(args.datadir, '%s_fprints_lost_p.csv' % datepart)

            js_res_rec = collections.OrderedDict()
            js_res_rec['fname'] = fname
            js_res_rec['fhref'] = flink
            js_res_rec['date'] = datepart
            js_res_rec['date_utc'] = date_utc
            js_res_rec['hostfile'] = hostfile
            js_res_rec['certfile'] = certfile
            js_res_rec['fprintfile_new'] = fprintfile_new
            js_res_rec['rec'] = filerec

            logger.info('Processing test idx %s, file %s' % (test_idx, fname))

            not_found = 0
            fprints_set = set()
            fprints_set_new = set()
            iobj = None
            hosth = None

            # Open local or open remote (with download to local)
            if os.path.exists(hostfile):
                iobj = input_obj.FileInputObject(fname=hostfile)
            else:
                hosth = open(hostfile, 'wb')
                iobj = input_obj.ReconnectingLinkInputObject(url=flink, rec=filerec)
                iobj = input_obj.TeeInputObject(parent_fh=iobj, copy_fh=hosth, close_copy_on_exit=True)

            # Processing ip -> fingerprints associations
            with iobj:
                fh = gzipinputstream.GzipInputStream(fileobj=iobj)
                for rec_idx, rec in enumerate(fh):
                    try:
                        linerec = rec.strip().split(',')
                        ip = linerec[0].strip()
                        fprint = utils.strip_hex_prefix(linerec[2].strip())
                        fprints_set.add(fprint)

                        if rec_idx % 1000000 == 0:
                            iobj.flush()
                            logger.debug(' .. progress %s, ip %s, mem: %s MB'
                                         % (rec_idx, ip, utils.get_mem_mb()))

                    except Exception as e:
                        logger.error('Exception in processing rec %s: %s' % (rec_idx, e))
                        logger.debug(rec)
                        logger.debug(traceback.format_exc())

            fprints_len = len(fprints_set)
            logger.info('File processed, fprint db size: %d' % fprints_len)

            # Only fingerprints
            logger.info('Going to sort fprints...')
            fprints = list(fprints_set)
            fprints.sort()
            logger.info('fprints sorted. Storing fingerprints. Mem: %s MB' % (utils.get_mem_mb()))

            # Store only new fingerprints, not seen before
            logger.info('Storing new fingerprints. Mem: %s MB' % (utils.get_mem_mb()))
            with open(fprintfile_new, 'w') as outfh:
                for fprint in fprints:
                    if fprint not in fprints_seen_set:
                        outfh.write('%s\n' % fprint)
                        fprints_set_new.add(fprint)
                        fprints_seen_set.add(fprint)

            # Certificates new from previous
            logger.info('Storing new fingerprints from previous. Mem: %s MB' % (utils.get_mem_mb()))
            with open(fprintfile_new_p, 'w') as outfh:
                for fprint in fprints:
                    if fprint not in fprints_previous:
                        outfh.write('%s\n' % fprint)

            # Certificates removed from previous
            logger.info('Storing lost fingerprints from previous. Mem: %s MB' % (utils.get_mem_mb()))
            fprints_previous_list = list(fprints_previous)
            fprints_previous_list.sort()

            with open(fprintfile_lost_p, 'w') as outfh:
                for fprint in fprints_previous_list:
                    if fprint not in fprints_set:
                        outfh.write('%s\n' % fprint)

            # Certificates file _certs.gz - only new certificates, incremental records
            fprints_new = list(fprints_set_new)
            fprints_new.sort()

            fprints_len = len(fprints_new)
            fprints_progress_unit = fprints_len / 100
            fprints_progress_last = 0
            logger.info('Dumping only new certificates, fprint db size: %d' % fprints_len)

            with gzip.open(certfile, 'wb') as outfh:
                for rec_idx, fprint in enumerate(fprints_new):

                    if fprints_progress_last + fprints_progress_unit < rec_idx:
                        fprints_progress_last = rec_idx
                        outfh.flush()
                        logger.debug(' .. progress %s, mem: %s MB'
                                     % (rec_idx, utils.get_mem_mb()))

                    if fprint in bigdb:
                        outfh.write('%s,%s\n' % (fprint, base64.b64encode(bigdb[fprint])))

                    else:
                        not_found += 1

            logger.info('Finished with idx %s, file %s, newfile: %s, not found: %s, mem: %s MB'
                        % (test_idx, fname, certfile, not_found, utils.get_mem_mb()))

            # Final step - store to previous
            fprints_previous = set(fprints_set)

            # Result file record flush
            json_res_fh.write(json.dumps(js_res_rec) + '\n')
            json_res_fh.flush()

        json_res_fh.close()
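The incremental _certs.gz files produced here carry one fprint,base64cert line per certificate first seen in that snapshot. A short sketch of reading one back, under that format assumption:

import base64
import gzip


def iter_incremental_certs(path):
    # Yields (fingerprint, raw certificate bytes) pairs from one _certs.gz file.
    with gzip.open(path, 'rb') as fh:
        for line in fh:
            fprint, cert_b64 = line.strip().split(b',', 1)
            yield fprint.decode(), base64.b64decode(cert_b64)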