Example 1
    def generate_workset(self):
        """
        Prepares input objects for processing
        :return: 
        """
        # Build input objects
        for file_name in self.args.file:
            iobj = input_obj.FileInputObject(file_name, rec=None)
            self.input_objects.append(iobj)

        for url in self.args.url:
            iobj = self._build_link_object(url=url, rec=None)
            self.input_objects.append(iobj)

        link_indices = None
        if len(self.args.link_idx) > 0:
            link_indices = set([int(x) for x in self.args.link_idx])

        for link_file in self.args.link_file:
            with open(link_file, 'r') as fh:
                data = fh.read()
                js = json.loads(data)
                datasets = js['data']

            for dataset in datasets:
                did = dataset['id']
                if link_indices is not None and did not in link_indices:
                    continue

                iobj = self._build_link_object(
                    url=dataset['files']['zgrab-results.json.lz4']['href'],
                    rec=dataset)
                self.input_objects.append(iobj)
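
For orientation, generate_workset() expects its --link-file JSON to carry a 'data' list of dataset records, each with an 'id' and a 'files' map keyed by file name whose entries carry an 'href' (the same layout the main() scrapers further below emit). A minimal, made-up file in that shape, assuming only the key names actually read by the code:

import json

# Illustrative only: the values are invented; the key names ('data', 'id',
# 'files', 'zgrab-results.json.lz4', 'href') are the ones read by
# generate_workset() above.
link_file_example = {
    'generated': 1500000000.0,
    'data': [
        {
            'id': 10,
            'date': '2017-06-20 14:13:13',
            'files': {
                'zgrab-results.json.lz4': {
                    'name': 'zgrab-results.json.lz4',
                    'href': 'https://example.org/zgrab-results.json.lz4',
                },
            },
        },
    ],
}

with open('links.json', 'w') as fh:
    json.dump(link_file_example, fh, indent=2)
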
Example 2
    def work(self):
        """
        Entry point after argument processing.
        :return: 
        """
        iobjs = []

        for url in self.args.url:
            iobjs.append(input_obj.ReconnectingLinkInputObject(url=url))

        for file in self.args.file:
            iobjs.append(input_obj.FileInputObject(file))

        for iobj in iobjs:
            self.process(iobj)
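
The input_obj module itself is not shown in these snippets. From how its objects are used throughout (as context managers, with text(), read()-style access for GzipInputStream, and flush()), a minimal file-backed stand-in could look roughly like the sketch below; it is an assumption about the interface, not the project's actual FileInputObject.

class FileInputObjectSketch(object):
    """Hypothetical stand-in mirroring the interface the snippets rely on."""

    def __init__(self, fname, rec=None):
        self.fname = fname
        self.rec = rec
        self.fh = None

    def __enter__(self):
        # Opened lazily so the object can be built before processing starts.
        self.fh = open(self.fname, 'rb')
        return self

    def __exit__(self, exc_type, exc_val, exc_tb):
        self.fh.close()

    def read(self, size=-1):
        # Lets stream readers wrap it, e.g. GzipInputStream(fileobj=...).
        return self.fh.read(size)

    def text(self):
        # Whole-file convenience used by the HTML scrapers below.
        return self.fh.read().decode('utf-8')

    def flush(self):
        # No-op here; network-backed variants may flush a local copy.
        pass
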
Example 3
def main():
    """
    Processes the sonar.ssl link page into JSON.
    https://scans.io/study/sonar.ssl
    :return:
    """
    parser = argparse.ArgumentParser(
        description='Processes SonarSSL links from the page, generates json')

    parser.add_argument('--url',
                        dest='url',
                        nargs=argparse.ZERO_OR_MORE,
                        default=[],
                        help='censys links')

    parser.add_argument('file',
                        nargs=argparse.ZERO_OR_MORE,
                        default=[],
                        help='censys link file')

    args = parser.parse_args()

    # Process the input

    dataset_idx = 10
    datasets = {}

    input_objects = []
    for file_name in args.file:
        input_objects.append(input_obj.FileInputObject(file_name))
    for url in args.url:
        input_objects.append(input_obj.LinkInputObject(url))

    if len(input_objects) == 0:
        print('Error: no input given')
        sys.exit(1)

    for iobj in input_objects:
        logger.info('Processing %s' % iobj)

        with iobj:
            data = iobj.text()
            tree = html.fromstring(data)
            tables = tree.xpath('//table')

            if len(tables) == 0:
                logger.error('Parsing problem: no tables found')
                continue

            for tbl_idx, table in enumerate(tables):
                rows = table[1]  # tbody
                rows_cnt = len(rows)
                if rows_cnt < 2:
                    logger.warning('Table %d does not have enough rows: %d' %
                                   (tbl_idx, rows_cnt))
                    continue

                for row_idx, row in enumerate(rows):
                    if row[0].tag != 'td':
                        continue

                    file_href = row[0][0].attrib['href'].strip()
                    file_name = row[0][0].text_content().strip()
                    file_hash = row[2].text_content().strip()
                    file_size = row[3].text_content().strip()
                    file_date = row[4].text_content().strip()

                    if file_date not in datasets:
                        dataset = collections.OrderedDict()
                        dataset['id'] = dataset_idx
                        dataset['date'] = file_date
                        dataset['date_utc'] = utils.unix_time(
                            datetime.strptime(file_date, '%Y-%m-%d'))
                        dataset['files'] = collections.OrderedDict()
                        datasets[file_date] = dataset
                        dataset_idx += 1
                    else:
                        dataset = datasets[file_date]

                    link = Link(file_name, file_href, file_size, file_hash)
                    dataset['files'][file_name] = link.to_json()

    js = collections.OrderedDict()
    js['generated'] = time.time()
    js['data'] = sorted([datasets[x] for x in datasets], key=lambda x: x['id'])
    print(json.dumps(js, indent=2))
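
The cell indexing above (row[0][0] for the link, row[2] through row[4] for hash, size and date) is easiest to follow against a concrete table. The fragment below is invented and only mimics the layout the code assumes (a thead followed by a tbody with five cells per row); it is not the real scans.io markup.

from lxml import html

sample = '''
<table>
  <thead><tr><th>File</th><th>Type</th><th>Hash</th><th>Size</th><th>Date</th></tr></thead>
  <tbody>
    <tr>
      <td><a href="https://example.org/20131104_hosts.gz">20131104_hosts.gz</a></td>
      <td>hosts</td>
      <td>deadbeef</td>
      <td>1.2 GB</td>
      <td>2013-11-04</td>
    </tr>
  </tbody>
</table>
'''

tree = html.fromstring(sample)
table = tree.xpath('//table')[0]
rows = table[1]                          # tbody, as in `rows = table[1]` above
row = rows[0]
print(row[0][0].attrib['href'])          # file_href
print(row[0][0].text_content().strip())  # file_name
print(row[2].text_content().strip())     # file_hash
print(row[3].text_content().strip())     # file_size
print(row[4].text_content().strip())     # file_date
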
Example 4
def main():
    """
    Processes the Censys link page into JSON.
    https://censys.io/data/443-https-tls-alexa_top1mil/historical
    https://censys.io/data/443-https-tls-full_ipv4/historical
    :return:
    """
    parser = argparse.ArgumentParser(
        description='Processes Censys links from the page, generates json')

    parser.add_argument('--url',
                        dest='url',
                        nargs=argparse.ZERO_OR_MORE,
                        default=[],
                        help='censys links')

    parser.add_argument('file',
                        nargs=argparse.ZERO_OR_MORE,
                        default=[],
                        help='censys link file')

    args = parser.parse_args()

    dataset_idx = 10
    datasets = []

    input_objects = []
    for file_name in args.file:
        input_objects.append(input_obj.FileInputObject(file_name))
    for url in args.url:
        input_objects.append(input_obj.LinkInputObject(url))

    if len(input_objects) == 0:
        print('Error: no input given')
        sys.exit(1)

    for iobj in input_objects:
        logger.info('Processing %s' % iobj)

        with iobj:
            data = iobj.text()
            tree = html.fromstring(data)
            tables = tree.xpath('//table')

            if len(tables) == 0:
                logger.error(
                    'Parsing problem: no tables found (probably not logged in)'
                )
                continue

            for tbl_idx, table in enumerate(reversed(tables)):
                rows = table[0]
                rows_cnt = len(rows)
                if rows_cnt < 2:
                    logger.warning('Table %d does not have enough rows: %d' %
                                   (tbl_idx, rows_cnt))
                    continue

                prev_h2 = table.getprevious()
                header = prev_h2.text_content().strip()

                dataset = collections.OrderedDict()
                dataset['id'] = dataset_idx
                dataset['date'] = header
                dataset['date_utc'] = utils.unix_time(
                    datetime.strptime(header, '%Y-%m-%d %H:%M:%S'))
                dataset['files'] = collections.OrderedDict()
                for row_idx, row in enumerate(rows):
                    if row_idx == 0 or row[0].tag != 'td':
                        continue

                    file_href = row[0][0].attrib['href'].strip()
                    file_code = row[0][0].attrib['download'].strip()
                    file_name = row[0][0].text_content().strip()

                    file_type = row[1].text_content().strip()
                    file_size = row[2].text_content().strip()
                    file_hash = row[3].text_content().strip()
                    # logger.info('File %d %s %s %s %s %s %s' % (row_idx, file_href, file_code, file_name, file_type, file_size, file_hash))

                    link = Link(file_name, file_code, file_href, file_size,
                                file_type, file_hash)
                    dataset['files'][file_name] = link.to_json()

                if 'zgrab-results.json.lz4' not in dataset['files']:
                    logger.warning('Zgrab result file not found in %d' %
                                   dataset_idx)
                    logger.info(
                        'H: %s, files: %s' %
                        (header, ' '.join([x for x in dataset['files']])))

                datasets.append(dataset)
                dataset_idx += 1

    js = collections.OrderedDict()
    js['generated'] = time.time()
    js['data'] = datasets
    print(json.dumps(js, indent=2))
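
The Link helper is outside the snippet. Its constructor arity here (name, code, href, size, type, hash) and the 'name'/'href' keys that other examples read back from 'files' suggest something like the reconstruction below; the remaining key names and field order are assumptions, and Example 3 uses a shorter, four-argument variant.

import collections


class Link(object):
    """Hypothetical reconstruction of the Link record used above."""

    def __init__(self, name, code, href, size, ftype, fhash):
        self.name = name
        self.code = code
        self.href = href
        self.size = size
        self.ftype = ftype
        self.fhash = fhash

    def to_json(self):
        # Only 'name' and 'href' are demonstrably read elsewhere in these
        # examples; the other keys are assumed for completeness.
        js = collections.OrderedDict()
        js['name'] = self.name
        js['code'] = self.code
        js['href'] = self.href
        js['size'] = self.size
        js['type'] = self.ftype
        js['hash'] = self.fhash
        return js
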
Example 5
    def main(self):
        """
        Processes censys sonar.ssl data.
        Recodes the single big certificate file into smaller _certs.gz files, as published since 2015,
        so it can be processed in the same way.
        
        https://scans.io/study/sonar.ssl
        :return:
        """
        parser = argparse.ArgumentParser(
            description='Recoding big sonarssl file to the incremental one')

        parser.add_argument('--url',
                            dest='url',
                            nargs=argparse.ZERO_OR_MORE,
                            default=[],
                            help='censys links')

        parser.add_argument('--json',
                            dest='json',
                            default=None,
                            help='sonar links json')

        parser.add_argument('--datadir',
                            dest='datadir',
                            default='.',
                            help='datadir')

        parser.add_argument('--fprint-only',
                            dest='fprint_only',
                            default=False,
                            action='store_const',
                            const=True,
                            help='Only fprint gen')

        parser.add_argument('--base-only',
                            dest='base_only',
                            default=False,
                            action='store_const',
                            const=True,
                            help='Chunk only one big dataset sample')

        parser.add_argument('file',
                            nargs=argparse.ZERO_OR_MORE,
                            default=[],
                            help='censys link file')

        args = parser.parse_args()

        # Big in memory hash table fprint -> certificate
        bigdb = {}
        testrng = range(10, 93) if args.base_only else range(10, 181)

        # fprints seen
        fprints_seen_set = set()
        fprints_previous = set()

        if not args.fprint_only:
            if len(args.file) == 0:
                return
            main_file = args.file[0]
            self.load_cert_db(main_file, bigdb)

        jsdb = None
        with open(args.json, 'r') as fh:
            jsdb = json.load(fh)

        jsdb_ids = {x['id']: x for x in jsdb['data']}
        for test_idx in testrng:
            files = jsdb_ids[test_idx]['files']
            filerec = None
            for tmprec in files:
                if '_hosts.gz' in tmprec:
                    filerec = files[tmprec]
                    break

            fname = filerec['name']
            flink = filerec['href']

            # 20131104/20131104_hosts.gz
            fname_2 = fname.split('/')
            if len(fname_2) == 2:
                fname_2 = fname_2[1]
            else:
                fname_2 = fname_2[0]

            dateparts = fname_2.split('_')
            datepart = dateparts[0]

            hostfile = os.path.join(args.datadir, '%s_hosts.gz' % datepart)
            certfile = os.path.join(args.datadir, '%s_certs.gz' % datepart)
            fprintfile = os.path.join(args.datadir,
                                      '%s_fprints.csv' % datepart)
            fprintfile_new = os.path.join(args.datadir,
                                          '%s_fprints_new.csv' % datepart)
            fprintfile_new_p = os.path.join(args.datadir,
                                            '%s_fprints_new_p.csv' % datepart)
            fprintfile_lost_p = os.path.join(
                args.datadir, '%s_fprints_lost_p.csv' % datepart)
            fprintfile_same = os.path.join(args.datadir,
                                           '%s_fprints_same.csv' % datepart)
            logger.info('Processing test idx %s, file %s, newfile: %s' %
                        (test_idx, fname, certfile))

            not_found = 0
            fprints_set = set()
            fprints_set_new = set()
            iobj = None
            hosth = None

            if os.path.exists(hostfile):
                iobj = input_obj.FileInputObject(fname=hostfile)

            elif args.fprint_only:
                continue

            else:
                hosth = open(hostfile, 'wb')
                iobj = input_obj.ReconnectingLinkInputObject(url=flink,
                                                             rec=files)
                iobj = input_obj.TeeInputObject(parent_fh=iobj,
                                                copy_fh=hosth,
                                                close_copy_on_exit=True)

            # Reading host file, ip -> fprints associations
            with iobj:
                fh = gzipinputstream.GzipInputStream(fileobj=iobj)
                for rec_idx, rec in enumerate(fh):
                    try:

                        linerec = rec.strip().split(',')
                        ip = linerec[0].strip()
                        fprints = linerec[1:]
                        for fprint in fprints:
                            fprint = utils.strip_hex_prefix(
                                fprint.strip()).lower()
                            fprints_set.add(fprint)

                        if rec_idx % 1000000 == 0:
                            iobj.flush()
                            logger.debug(
                                ' .. progress %s, ip %s, mem: %s MB' %
                                (rec_idx, ip, utils.get_mem_usage() / 1024.0))

                    except Exception as e:
                        logger.error('Exception in processing rec %s: %s' %
                                     (rec_idx, e))
                        logger.debug(rec)
                        logger.debug(traceback.format_exc())

            fprints_len = len(fprints_set)
            logger.info('File processed, fprint db size: %d. Mem: %s MB' %
                        (fprints_len, utils.get_mem_mb()))

            # Only fingerprints
            logger.info('Going to sort fprints...')
            fprints = list(fprints_set)
            fprints.sort()
            logger.info('fprints sorted. Storing fingerprints. Mem: %s MB' %
                        (utils.get_mem_usage() / 1024.0))

            # Store only new fingerprints, not seen before
            logger.info('Storing new fingerprints. Mem: %s MB' %
                        (utils.get_mem_usage() / 1024.0))
            with open(fprintfile_new, 'w') as outfh:
                for fprint in fprints:
                    if fprint not in fprints_seen_set:
                        outfh.write('%s\n' % fprint)
                        fprints_set_new.add(fprint)
                        fprints_seen_set.add(fprint)

            # Certificates new from previous
            logger.info('Storing new fingerprints from previous. Mem: %s MB' %
                        (utils.get_mem_usage() / 1024.0))
            with open(fprintfile_new_p, 'w') as outfh:
                for fprint in fprints:
                    if fprint not in fprints_previous:
                        outfh.write('%s\n' % fprint)

            # Certificates removed from previous
            logger.info('Storing lost fingerprints from previous. Mem: %s MB' %
                        (utils.get_mem_usage() / 1024.0))
            fprints_previous_list = list(fprints_previous)
            fprints_previous_list.sort()

            with open(fprintfile_lost_p, 'w') as outfh:
                for fprint in fprints_previous_list:
                    if fprint not in fprints_set:
                        outfh.write('%s\n' % fprint)

            # Certificates same as in the previous dataset
            logger.info('Storing same fingerprints as previous. Mem: %s MB' %
                        (utils.get_mem_usage() / 1024.0))
            with open(fprintfile_same, 'w') as outfh:
                for fprint in fprints:
                    if fprint in fprints_previous:
                        outfh.write('%s\n' % fprint)

            # Store only fingerprints contained in this set.
            with open(fprintfile, 'w') as outfh:
                for fprint in fprints:
                    outfh.write('%s\n' % fprint)

            if args.fprint_only:
                fprints_previous = set(fprints_set)
                continue

            # Certificates file _certs.gz - only new certificates
            fprints_new = list(fprints_set_new)
            fprints_new.sort()

            fprints_len = len(fprints_new)
            fprints_progress_unit = fprints_len / 100
            fprints_progress_last = 0
            logger.info('Dumping only new certificates, fprint db size: %d' %
                        fprints_len)

            with gzip.open(certfile, 'wb') as outfh:
                for rec_idx, fprint in enumerate(fprints_new):

                    if fprints_progress_last + fprints_progress_unit < rec_idx:
                        fprints_progress_last = rec_idx
                        outfh.flush()
                        logger.debug(' .. progress %s, mem: %s MB' %
                                     (rec_idx, utils.get_mem_usage() / 1024.0))

                    if fprint in bigdb:
                        outfh.write('%s,%s\n' %
                                    (fprint, base64.b64encode(bigdb[fprint])))

                    else:
                        not_found += 1

            logger.info(
                'Finished with idx %s, file %s, newfile: %s, not found: %s, mem: %s MB'
                % (test_idx, fname, certfile, not_found,
                   utils.get_mem_usage() / 1024.0))

            # Final step - store to previous
            fprints_previous = set(fprints_set)
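
The four per-snapshot fingerprint files written above are plain set operations over the current snapshot, the previous snapshot, and the set of everything seen so far. A toy illustration of the same bookkeeping, with invented values:

fprints_previous = {'aa', 'bb', 'cc'}        # previous snapshot
fprints_seen_set = {'aa', 'bb', 'cc', 'dd'}  # everything seen so far
fprints_set = {'bb', 'cc', 'ee'}             # current snapshot

print(sorted(fprints_set - fprints_seen_set))   # ['ee']        -> _fprints_new.csv
print(sorted(fprints_set - fprints_previous))   # ['ee']        -> _fprints_new_p.csv
print(sorted(fprints_previous - fprints_set))   # ['aa']        -> _fprints_lost_p.csv
print(sorted(fprints_set & fprints_previous))   # ['bb', 'cc']  -> _fprints_same.csv

# After each snapshot the code also folds the new fingerprints into
# fprints_seen_set and replaces fprints_previous with the current set.
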
    def work(self):
        """
        Processing
        :return: 
        """

        # Open the json link file
        args = self.args
        index_db = None
        with open(args.json, 'r') as fh:
            index_db = json.load(fh)
        self.index_db = index_db

        # Manage the raw_certificates file
        main_cert_rec = index_db['data'][1]
        main_cert_link = main_cert_rec['fhref']
        main_cert_file = os.path.join(args.datadir, os.path.basename(main_cert_link))
        json_res_file = os.path.join(args.datadir, 'eco.json')
        json_res_fh = open(json_res_file, 'w')

        iobj = None
        if os.path.exists(main_cert_file):
            iobj = input_obj.FileInputObject(fname=main_cert_file)

        elif os.path.exists(main_cert_file[:-3]):  # ungzipped
            main_cert_file = main_cert_file[:-3]
            iobj = input_obj.FileInputObject(fname=main_cert_file)

        else:
            logger.info('Going to download certificate file')
            hosth = open(main_cert_file, 'wb')
            iobj = input_obj.ReconnectingLinkInputObject(url=main_cert_link, rec=main_cert_rec)
            iobj = input_obj.TeeInputObject(parent_fh=iobj, copy_fh=hosth, close_copy_on_exit=True)

        # Big in memory hash table fprint -> certificate
        bigdb = {}
        counter = 0
        testrng = range(15, 171)

        # fprints seen
        fprints_seen_set = set()
        fprints_previous = set()

        # Process the main certificate file
        with iobj:
            fh = iobj
            if main_cert_file.endswith('.gz'):
                fh = gzipinputstream.GzipInputStream(fileobj=iobj)
            else:
                fh = NewlineIterator(iobj)

            for idx, line in enumerate(fh):
                try:
                    csv = line.split(',')
                    fprint = utils.strip_hex_prefix(csv[0].strip())
                    cert = utils.strip_hex_prefix(csv[1].strip())

                    certbin = base64.b16decode(cert, True)
                    bigdb[fprint] = certbin
                    counter += 1

                    if counter % 100000 == 0:
                        logger.debug(' .. progress %s, fprint %s, memory: %s MB'
                                     % (counter, fprint, utils.get_mem_mb()))

                except Exception as e:
                    logger.error('Error in processing %s' % e)
                    logger.debug(traceback.format_exc())

        logger.info('Uff... big DB loaded, num entries: %s' % len(bigdb))

        # Load sequential scans
        # Host file association ip -> fingerprint
        jsdb_ids = {x['id']: x for x in self.index_db['data']}
        last_file_date_utc = 0
        for test_idx in testrng:
            filerec = jsdb_ids[test_idx]
            fname = filerec['fname']
            flink = filerec['fhref']
            datepart = filerec['date']
            date_utc = filerec['date_utc']
            fname_2 = os.path.basename(fname)

            # As dataset is in a form of snapshots we can skip some time intervals.
            if self.args.space and date_utc - last_file_date_utc < (60*60*24*7 - 60*60):
                logger.info('Skipping record %d, the time difference from the previous record is too small: %s'
                            % (test_idx, date_utc - last_file_date_utc))
                continue

            last_file_date_utc = date_utc
            hostfile = os.path.join(args.datadir, fname_2)
            certfile = os.path.join(args.datadir, '%s_certs.gz' % datepart)
            fprintfile_new = os.path.join(args.datadir, '%s_fprints_new.csv' % datepart)
            fprintfile_new_p = os.path.join(args.datadir, '%s_fprints_new_p.csv' % datepart)
            fprintfile_lost_p = os.path.join(args.datadir, '%s_fprints_lost_p.csv' % datepart)

            js_res_rec = collections.OrderedDict()
            js_res_rec['fname'] = fname
            js_res_rec['fhref'] = flink
            js_res_rec['date'] = datepart
            js_res_rec['date_utc'] = date_utc
            js_res_rec['hostfile'] = hostfile
            js_res_rec['certfile'] = certfile
            js_res_rec['fprintfile_new'] = fprintfile_new
            js_res_rec['rec'] = filerec

            logger.info('Processing test idx %s, file %s' % (test_idx, fname))

            not_found = 0
            fprints_set = set()
            fprints_set_new = set()
            iobj = None
            hosth = None

            # Open local or open remote (with download to local)
            if os.path.exists(hostfile):
                iobj = input_obj.FileInputObject(fname=hostfile)
            else:
                hosth = open(hostfile, 'wb')
                iobj = input_obj.ReconnectingLinkInputObject(url=flink, rec=filerec)
                iobj = input_obj.TeeInputObject(parent_fh=iobj, copy_fh=hosth, close_copy_on_exit=True)

            # Processing ip -> fingerprints associations
            with iobj:
                fh = gzipinputstream.GzipInputStream(fileobj=iobj)
                for rec_idx, rec in enumerate(fh):
                    try:
                        linerec = rec.strip().split(',')
                        ip = linerec[0].strip()
                        fprint = utils.strip_hex_prefix(linerec[2].strip())
                        fprints_set.add(fprint)

                        if rec_idx % 1000000 == 0:
                            iobj.flush()
                            logger.debug(' .. progress %s, ip %s, mem: %s MB'
                                         % (rec_idx, ip, utils.get_mem_mb()))

                    except Exception as e:
                        logger.error('Exception in processing rec %s: %s' % (rec_idx, e))
                        logger.debug(rec)
                        logger.debug(traceback.format_exc())

            fprints_len = len(fprints_set)
            logger.info('File processed, fprint db size: %d' % fprints_len)

            # Only fingerprints
            logger.info('Going to sort fprints...')
            fprints = list(fprints_set)
            fprints.sort()
            logger.info('fprints sorted. Storing fingerprints. Mem: %s MB' % (utils.get_mem_mb()))

            # Store only new fingerprints, not seen before
            logger.info('Storing new fingerprints. Mem: %s MB' % (utils.get_mem_mb()))
            with open(fprintfile_new, 'w') as outfh:
                for fprint in fprints:
                    if fprint not in fprints_seen_set:
                        outfh.write('%s\n' % fprint)
                        fprints_set_new.add(fprint)
                        fprints_seen_set.add(fprint)

            # Certificates new from previous
            logger.info('Storing new fingerprints from previous. Mem: %s MB' % (utils.get_mem_mb()))
            with open(fprintfile_new_p, 'w') as outfh:
                for fprint in fprints:
                    if fprint not in fprints_previous:
                        outfh.write('%s\n' % fprint)

            # Certificates removed from previous
            logger.info('Storing lost fingerprints from previous. Mem: %s MB' % (utils.get_mem_mb()))
            fprints_previous_list = list(fprints_previous)
            fprints_previous_list.sort()

            with open(fprintfile_lost_p, 'w') as outfh:
                for fprint in fprints_previous_list:
                    if fprint not in fprints_set:
                        outfh.write('%s\n' % fprint)

            # Certificates file _certs.gz - only new certificates, incremental records
            fprints_new = list(fprints_set_new)
            fprints_new.sort()

            fprints_len = len(fprints_new)
            fprints_progress_unit = fprints_len / 100
            fprints_progress_last = 0
            logger.info('Dumping only new certificates, fprint db size: %d' % fprints_len)

            with gzip.open(certfile, 'wb') as outfh:
                for rec_idx, fprint in enumerate(fprints_new):

                    if fprints_progress_last + fprints_progress_unit < rec_idx:
                        fprints_progress_last = rec_idx
                        outfh.flush()
                        logger.debug(' .. progress %s, mem: %s MB'
                                     % (rec_idx, utils.get_mem_mb()))

                    if fprint in bigdb:
                        outfh.write('%s,%s\n' % (fprint, base64.b64encode(bigdb[fprint])))

                    else:
                        not_found += 1

            logger.info('Finished with idx %s, file %s, newfile: %s, not found: %s, mem: %s MB'
                        % (test_idx, fname, certfile, not_found, utils.get_mem_mb()))

            # Final step - store to previous
            fprints_previous = set(fprints_set)

            # Result file record flush
            json_res_fh.write(json.dumps(js_res_rec) + '\n')
            json_res_fh.flush()

        json_res_fh.close()
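
work() appends one JSON object per line to eco.json, so the result file can be read back as newline-delimited JSON. A minimal reader, assuming only the field names assigned to js_res_rec above:

import json

with open('eco.json', 'r') as fh:
    for line in fh:
        line = line.strip()
        if not line:
            continue
        rec = json.loads(line)
        print(rec['date'], rec['fname'], rec['certfile'])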