Example #1
    def run(self):
        while self._max_executions > 0:
            # Stop if our master died.
            if self._ppid != os.getppid():
                break

            # Our master asked us to stop. We must obey.
            if self._stop_event.is_set():
                break

            # Poll for work; idle briefly if the queue is empty.
            job = self._job_queue.get_job_nowait()
            if job is None:
                time.sleep(1)
                continue

            # Tell the master we picked up the job.
            start_info = (job.scan_id, job.scan_module.name, datetime.today(),
                          job.num_tries)
            self._notify_master('job_started', start_info)

            # Set up the result object and a per-job logger that streams
            # records back to the master through the write pipe.
            result = Result(job.current_result, NoOpFileHandler())
            logger = logging.Logger(job.scan_module.name)
            logger.addHandler(
                WorkerWritePipeHandler(self._pid, self._write_pipe))
            logger.addHandler(ScanStreamHandler())
            scan_meta = ScanMeta(worker_id=self._id, num_tries=job.num_tries)
            # Run the scan module in a fresh temporary working directory.
            with tempfile.TemporaryDirectory() as temp_dir:
                old_cwd = os.getcwd()
                os.chdir(temp_dir)
                try:
                    job.scan_module.scan_site(result, logger, job.options,
                                              scan_meta)
                except RetryScan:
                    self._job_queue.report_failure()
                    self._notify_master('job_failed', (datetime.today(), ))
                except RescheduleLater as e:
                    self._job_queue.reschedule(e.not_before)
                    self._job_queue.report_result(result.get_updates())
                    self._notify_master('job_finished', (datetime.today(), ))
                except Exception:
                    logger.exception('Scan module `%s` failed.',
                                     job.scan_module.name)
                    self._job_queue.report_failure()
                    self._notify_master('job_failed', (datetime.today(), ))
                    if self._raven_client:
                        self._raven_client.captureException(
                            tags={
                                'scan_id': job.scan_id,
                                'scan_module_name': job.scan_module.name
                            },
                            extra={'result': result.get_results()})
                else:
                    self._job_queue.report_result(result.get_updates())
                    self._notify_master('job_finished', (datetime.today(), ))
                finally:
                    os.chdir(old_cwd)
                    # Reap any child processes the scan module left behind.
                    kill_everything(self._pid, only_children=True)
            self._max_executions -= 1
        # Execution budget exhausted or we were told to stop: tear down.
        kill_everything(self._pid)
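
For context, here is a minimal, self-contained sketch of the worker-loop pattern run() uses: poll a queue, honor a stop event, and exit when the parent process disappears. The names (demo_worker, jobs, stop) are illustrative and not part of the codebase above.

import multiprocessing
import os
import queue
import time


def demo_worker(job_queue, stop_event, parent_pid, max_executions=10):
    while max_executions > 0:
        if os.getppid() != parent_pid:  # parent died; orphaned worker exits
            break
        if stop_event.is_set():  # graceful shutdown requested
            break
        try:
            job = job_queue.get_nowait()
        except queue.Empty:
            time.sleep(1)
            continue
        print('processing job', job)
        max_executions -= 1


if __name__ == '__main__':
    stop = multiprocessing.Event()
    jobs = multiprocessing.Queue()
    for i in range(3):
        jobs.put(i)
    worker = multiprocessing.Process(target=demo_worker,
                                     args=(jobs, stop, os.getpid()))
    worker.start()
    time.sleep(5)  # let the worker drain the queue
    stop.set()     # then ask it to stop
    worker.join()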
Example #2
def test_site(url: str, previous_results: dict, scan_basedir: str,
              virtualenv_path: str) -> Dict[str, Dict[str, Union[str, bytes]]]:
    """Test a site using openwpm and related tests."""

    result = {
        'raw_url': {
            'mime_type': 'text/plain',
            'data': url.encode(),
        }
    }

    # Skip OpenWPM if the site had a DNS error or was unreachable.
    if previous_results.get(
            'dns_error') or not previous_results.get('reachable'):
        return result

    # ensure basedir exists (avoids a race with parallel scans)
    os.makedirs(scan_basedir, exist_ok=True)

    # create scan dir
    scan_dir = os.path.join(scan_basedir, str(uuid4()))
    os.mkdir(scan_dir)

    file_handler = DirectoryFileHandler(scan_dir)
    logger = logging.getLogger()
    # Retry the OpenWPM scan up to three times before giving up.
    num_tries = 1
    while True:
        try:
            scanner_result = Result({'site_url': url}, file_handler)
            with get_worker_id() as worker_id:
                meta = ScanMeta(worker_id=worker_id, num_tries=num_tries)
                scan_site(scanner_result, logger, {}, meta)
            break
        except RetryScan:
            if num_tries >= 3:
                # Give up after three attempts: record an empty crawl
                # result and clean up the scan directory before returning.
                result['crawldata'] = {
                    'mime_type': 'application/json',
                    'data': json.dumps(None).encode(),
                }
                shutil.rmtree(scan_dir)
                return result

            num_tries += 1
            time.sleep(10)

    # screenshot
    screenshot_path = os.path.join(scan_dir, 'files', 'screenshot.png')
    if os.path.isfile(screenshot_path):
        with open(screenshot_path, 'rb') as f:
            result['screenshot'] = {
                'mime_type': 'image/png',
                'data': f.read(),
            }

    # crawl result
    result['crawldata'] = {
        'mime_type': 'application/json',
        'data': json.dumps(scanner_result.get_results()).encode(),
    }

    # cropped and pixelized screenshot
    if 'screenshot' in result:
        out = BytesIO()
        pixelize_screenshot(BytesIO(result['screenshot']['data']), out)
        result['cropped_screenshot'] = {
            'mime_type': 'image/png',
            'data': out.getvalue(),
        }

    # recursively delete scan folder
    shutil.rmtree(scan_dir)

    return result
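
The dict test_site() returns maps each artifact name to a mime type and raw bytes. A hypothetical consumer could persist those entries as in the sketch below; the save_result() helper and the extension mapping are assumptions for illustration, not part of the API above.

import os

# Assumed mapping for this sketch only.
EXTENSIONS = {
    'text/plain': '.txt',
    'image/png': '.png',
    'application/json': '.json',
}


def save_result(result, out_dir):
    """Write each mime-typed entry of a test_site() result to disk."""
    os.makedirs(out_dir, exist_ok=True)
    for name, entry in result.items():
        ext = EXTENSIONS.get(entry['mime_type'], '.bin')
        with open(os.path.join(out_dir, name + ext), 'wb') as f:
            f.write(entry['data'])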
Example #3
def scan_site(args):
    config = load_config(args.config)
    _require_dependencies(config)

    site_parsed = urlparse(args.site)
    if site_parsed.scheme not in ('http', 'https'):
        raise CommandError('Invalid site: {}'.format(args.site))

    results_dir = args.results
    if results_dir is None:
        # Default: slugified host name plus a short hash of the full URL
        # to keep directory names unique per site.
        results_dir = slugify(site_parsed.netloc) + '_'
        results_dir += hashlib.sha512(args.site.encode()).hexdigest()[:10]
    results_dir = Path(results_dir).resolve()
    try:
        results_dir.mkdir(exist_ok=True)
    except IOError as e:
        raise CommandError(
            'Could not create results directory: {}'.format(e)) from e

    result_file = results_dir / 'results.json'
    result_json = {'site_url': args.site}
    if args.import_results:
        try:
            with open(args.import_results) as f:
                import_json = json.load(f)
        except IOError as e:
            raise CommandError(
                'Could not open result JSON: {}'.format(e)) from e
        except ValueError as e:
            raise CommandError(
                'Could not parse result JSON: {}'.format(e)) from e
        else:
            result_json.update(import_json)
    try:
        with result_file.open('w') as f:
            json.dump(result_json, f, indent=2)
            f.write('\n')
    except IOError as e:
        raise CommandError('Could not write result JSON: {}'.format(e)) from e

    scan_modules = load_modules(config['SCAN_MODULES'],
                                config['SCAN_MODULE_OPTIONS'])
    scan_module_names = args.scan_modules

    if scan_module_names is None:
        scan_module_names = scan_modules.keys()
    requested_modules = set(scan_module_names)

    # Order scan_module_names topologically by their dependencies.
    dependencies = {}
    for scan_module_name in scan_module_names:
        mod = scan_modules[scan_module_name]
        dependencies[mod.name] = set(mod.dependencies)
    scan_module_names = toposort_flatten(dependencies)

    if args.skip_dependencies:
        scan_module_names = [
            scan_module_name for scan_module_name in scan_module_names
            if scan_module_name in requested_modules
        ]

    has_error = False
    result = Result(result_json, DirectoryFileHandler(results_dir))
    stream_handler = ScanStreamHandler()
    logs_dir = results_dir / 'logs'
    logs_dir.mkdir(exist_ok=True)
    lock_dir = config['STORAGE_PATH'] / 'locks'
    lock_dir.mkdir(exist_ok=True)
    scan_queue = [
        QueueEntry(mod_name, 0, None) for mod_name in scan_module_names
    ]
    # pop() removes from the end, so reverse to process in topological order.
    scan_queue.reverse()
    while scan_queue:
        scan_module_name, num_try, not_before = scan_queue.pop()
        if not_before is not None:
            # Busy-wait until the module's reschedule time has passed.
            # noinspection PyTypeChecker
            while datetime.utcnow() < not_before:
                time.sleep(0.5)
        mod = scan_modules[scan_module_name]
        num_try += 1
        log_filename = (logs_dir / (mod.name + '.log'))
        file_handler = ScanFileHandler(str(log_filename))
        logger = logging.Logger(mod.name)
        logger.addHandler(stream_handler)
        logger.addHandler(file_handler)
        with tempfile.TemporaryDirectory() as temp_dir:
            old_cwd = os.getcwd()
            os.chdir(temp_dir)
            logger.info('Starting %s', mod.name)
            try:
                with NumericLock(lock_dir) as worker_id:
                    scan_meta = ScanMeta(worker_id=worker_id,
                                         num_tries=num_try)
                    mod.logger = logger
                    mod.scan_site(result, scan_meta)
            except RetryScan:
                if num_try <= config['MAX_TRIES']:
                    scan_queue.append(
                        QueueEntry(scan_module_name, num_try, not_before))
                    logger.info('Scan module `%s` will be retried', mod.name)
                else:
                    has_error = True
            except RescheduleLater as e:
                scan_queue.append(
                    QueueEntry(scan_module_name, num_try, e.not_before))
            except Exception:
                if num_try <= config['MAX_TRIES']:
                    scan_queue.append(
                        QueueEntry(scan_module_name, num_try, not_before))
                has_error = True
                logger.exception('Scan module `%s` failed.', mod.name)
            finally:
                os.chdir(old_cwd)
                # Persist intermediate results after every module so a
                # crash does not lose completed work.
                with result_file.open('w') as f:
                    json.dump(result.get_results(), f, indent=2,
                              sort_keys=True)
                    f.write('\n')
            logger.info('Finished %s', mod.name)
    pprint.pprint(result.get_results())
    if has_error:
        sys.exit(1)
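
The dependency ordering above relies on toposort_flatten from the toposort package. A standalone illustration, with made-up module names:

from toposort import toposort_flatten

# Each module maps to the set of modules it depends on.
dependencies = {
    'testssl': set(),
    'openwpm': set(),
    'mail': {'testssl'},
    'final': {'openwpm', 'mail'},
}

# Yields an order in which every module appears after its dependencies,
# e.g. ['openwpm', 'testssl', 'mail', 'final'].
print(toposort_flatten(dependencies))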