Example #1
def histogram(training_set, ruleset, trainee, training_cache, delay, tabs,
              show_browser, buckets, rules):
    """Show a histogram of rule scores.

    We also break down what proportion of each bucket comprised positive or
    negative samples. Altogether, this gives you an idea whether a rule is
    broadly applicable, discriminatory, and spitting out what you expect.

    """
    training_set = Path(training_set)
    if training_set.is_dir():
        if not ruleset:
            raise BadOptionUsage(
                'ruleset',
                'A --ruleset file must be specified when TRAINING_SET_FOLDER is passed a directory.'
            )
        if not trainee:
            raise BadOptionUsage(
                'trainee',
                'A --trainee ID must be specified when TRAINING_SET_FOLDER is passed a directory.'
            )

    training_data = make_or_find_vectors(ruleset, trainee, training_set,
                                         training_cache, show_browser,
                                         'training', delay, tabs)
    training_pages = training_data['pages']
    x, y, num_yes, _ = tensors_from(training_pages)
    feature_names = training_data['header']['featureNames']
    print_feature_report(
        feature_metrics(feature_names, x, y, buckets, rules or feature_names))
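The check at the top of histogram is a pattern that recurs throughout these examples: validate interdependent arguments inside the command body and raise BadOptionUsage(option_name, message), which Click reports as a usage error and exits with status 2. A minimal, self-contained sketch of that wiring, with illustrative names that are not taken from the fathom project:

import click
from click import BadOptionUsage
from pathlib import Path


@click.command()
@click.argument('training_set')
@click.option('--ruleset', default=None, help='Path to a rulesets.js file.')
def vectorize(training_set, ruleset):
    """Hypothetical command showing the directory-needs-ruleset check."""
    if Path(training_set).is_dir() and not ruleset:
        # Click prints the usage line plus "Error: <message>" and exits with 2.
        raise BadOptionUsage(
            'ruleset',
            'A --ruleset file must be specified when TRAINING_SET is a directory.')
    click.echo('vectorizing {}'.format(training_set))


if __name__ == '__main__':
    vectorize()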
Example #2
def get_tortoise_config(ctx: Context, tortoise_orm: str) -> dict:
    """
    Get the Tortoise ORM config dict from a dotted module path.

    :param ctx: click Context, used for error reporting
    :param tortoise_orm: dotted path such as "settings.TORTOISE_ORM"
    :return: the config dict found at that path
    """
    splits = tortoise_orm.split(".")
    config_path = ".".join(splits[:-1])
    tortoise_config = splits[-1]
    try:
        config_module = importlib.import_module(config_path)
    except (ModuleNotFoundError, AttributeError):
        raise BadOptionUsage(ctx=ctx,
                             message=f'No config named "{config_path}"',
                             option_name="--config")

    config = getattr(config_module, tortoise_config, None)
    if not config:
        raise BadOptionUsage(
            option_name="--config",
            message=
            f'Can\'t get "{tortoise_config}" from module "{config_module}"',
            ctx=ctx,
        )
    return config
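A hedged usage sketch for get_tortoise_config: it assumes a settings module on the import path that defines a module-level TORTOISE_ORM dict in the usual Tortoise ORM layout; the dotted string is split into the module path and the attribute name.

# settings.py (hypothetical module, assumed importable)
TORTOISE_ORM = {
    "connections": {"default": "sqlite://db.sqlite3"},
    "apps": {
        "models": {"models": ["app.models"], "default_connection": "default"},
    },
}

# Inside a click command that has a Context `ctx`:
config = get_tortoise_config(ctx, "settings.TORTOISE_ORM")
# `config` is now the TORTOISE_ORM dict; a bad module or attribute name raises
# BadOptionUsage for --config instead.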
Example #3
def main(testing_set, weights, confidence_threshold, ruleset, trainee,
         testing_cache, delay, show_browser, verbose):
    """Compute the accuracy of the given coefficients and biases on a folder of
    testing samples.

    TESTING_SET_FOLDER is a directory of labeled testing pages. It can also be,
    for backward compatibility, a JSON file of vectors from FathomFox's
    Vectorizer.

    WEIGHTS should be a JSON-formatted object like this. You can paste it
    directly from the output of fathom-train.

        {"coeffs": [["nextAnchorIsJavaScript", 1.1627885103225708],
        ["nextButtonTypeSubmit", 4.613410949707031],
        ["nextInputTypeSubmit", 4.374269008636475]],

        "bias": -8.645608901977539}

    """
    testing_set = Path(testing_set)
    if testing_set.is_dir():
        if not ruleset:
            raise BadOptionUsage(
                'ruleset',
                'A --ruleset file must be specified when TESTING_SET_FOLDER is passed a directory.'
            )
        if not trainee:
            raise BadOptionUsage(
                'trainee',
                'A --trainee ID must be specified when TESTING_SET_FOLDER is passed a directory.'
            )

    with make_or_find_vectors(ruleset, trainee, testing_set, testing_cache,
                              show_browser, 'testing',
                              delay).open(encoding='utf-8') as testing_file:
        testing_data = load(testing_file)
    testing_pages = testing_data['pages']
    x, y, num_yes = tensors_from(testing_pages)
    model = model_from_json(weights, len(y[0]),
                            testing_data['header']['featureNames'])

    accuracy, false_positives, false_negatives = accuracy_per_tag(
        y, model(x), confidence_threshold)
    print(
        pretty_accuracy('Testing', accuracy, len(x), false_positives,
                        false_negatives, num_yes))

    if testing_pages and 'time' in testing_pages[0]:
        print(speed_readout(testing_pages))

    if verbose:
        print('\nTesting per-tag results:')
        print_per_tag_report([
            per_tag_metrics(page, model, confidence_threshold)
            for page in testing_pages
        ])
Example #4
File: tg.py Project: soar/imapmon
    def check_settings(self):
        if not self.settings.telegram_bot_token:
            raise BadOptionUsage(
                'channel',
                'Telegram bot token is required for the Telegram channel'
            )

        if not self.settings.telegram_chat_id:
            raise BadOptionUsage(
                'channel',
                'Telegram Chat ID is required for the Telegram channel'
            )
Example #5
def get_tortoise_config(ctx: Context, tortoise_orm: str) -> dict:
    """
    Get the Tortoise ORM config dict from a dotted module path.

    :param ctx: click Context, used for error reporting
    :param tortoise_orm: dotted path such as "settings.TORTOISE_ORM"
    :return: the config dict found at that path
    """
    splits = tortoise_orm.split(".")
    config_path = ".".join(splits[:-1])
    tortoise_config = splits[-1]

    try:
        config_module = importlib.import_module(config_path)
    except ModuleNotFoundError as e:
        raise ClickException(
            f"Error while importing configuration module: {e}") from None

    config = getattr(config_module, tortoise_config, None)
    if not config:
        raise BadOptionUsage(
            option_name="--config",
            message=
            f'Can\'t get "{tortoise_config}" from module "{config_module}"',
            ctx=ctx,
        )
    return config
Example #6
def hunt(urls, threads, exclude_flags, include_flags, interesting_extensions, interesting_files, stdout_flags,
         progress_enabled, timeout, max_depth, not_follow_subdomains, exclude_sources, proxies, delay,
         not_allow_redirects):
    """Find web directories without bruteforce
    """
    if exclude_flags and include_flags:
        raise BadOptionUsage('--exclude-flags',
                             '--exclude-flags and --include-flags are mutually exclusive.')
    welcome()
    if not urls:
        click.echo('•_•) OOPS! Add urls to analyze.\nFor example: dirhunt http://domain/path\n\n'
                   'Need help? Then use dirhunt --help', err=True)
        return
    exclude_flags, include_flags = flags_range(exclude_flags), flags_range(include_flags)
    progress_enabled = (sys.stdout.isatty() or sys.stderr.isatty()) if progress_enabled is None else progress_enabled
    crawler = Crawler(max_workers=threads, interesting_extensions=interesting_extensions,
                      interesting_files=interesting_files, std=sys.stdout if sys.stdout.isatty() else sys.stderr,
                      progress_enabled=progress_enabled, timeout=timeout, depth=max_depth,
                      not_follow_subdomains=not_follow_subdomains, exclude_sources=exclude_sources,
                      not_allow_redirects=not_allow_redirects, proxies=proxies, delay=delay)
    crawler.add_init_urls(*urls)
    try:
        catch_keyboard_interrupt(crawler.print_results, crawler.restart)(set(exclude_flags), set(include_flags))
    except SystemExit:
        crawler.close()
    crawler.print_urls_info()
    if not sys.stdout.isatty():
        output_urls(crawler, stdout_flags)
Example #7
def main(database: str, output: str) -> None:
    """Export rdial data for use with timew.

    Writes timew compatible data to ‘output’.
    """
    if exists(output):
        raise BadOptionUsage('output', 'Output path must not exist')
    files = process_events(database)
    write_events(output, files)
Example #8
def main(input: TextIO, output: str) -> None:
    """Export timew data for use with rdial.

    Reads the output of ‘timew export’, and writes rdial compatible data to
    ‘output’.
    """
    if path.exists(output):
        raise BadOptionUsage('output', 'Output path must not exist')
    files = process_records(input)
    write_events(output, files)
Example #9
def hunt(urls, threads, exclude_flags, include_flags, interesting_extensions, interesting_files, stdout_flags,
         progress_enabled, timeout, max_depth, not_follow_subdomains, exclude_sources, proxies, delay,
         not_allow_redirects, limit, to_file):
    """Find web directories without bruteforce
    """
    if exclude_flags and include_flags:
        raise BadOptionUsage('--exclude-flags',
                             '--exclude-flags and --include-flags are mutually exclusive.')
    welcome()
    urls = flat_list(urls)
    proxies = multiplier_args(proxies)
    if not urls:
        click.echo('•_•) OOPS! Add urls to analyze.\nFor example: dirhunt http://domain/path\n\n'
                   'Need help? Then use dirhunt --help', err=True)
        return
    exclude_flags, include_flags = flags_range(exclude_flags), flags_range(include_flags)
    progress_enabled = (sys.stdout.isatty() or sys.stderr.isatty()) if progress_enabled is None else progress_enabled
    crawler = Crawler(max_workers=threads, interesting_extensions=interesting_extensions,
                      interesting_files=interesting_files, std=sys.stdout if sys.stdout.isatty() else sys.stderr,
                      progress_enabled=progress_enabled, timeout=timeout, depth=max_depth,
                      not_follow_subdomains=not_follow_subdomains, exclude_sources=exclude_sources,
                      not_allow_redirects=not_allow_redirects, proxies=proxies, delay=delay, limit=limit,
                      to_file=to_file)
    if os.path.exists(crawler.get_resume_file()):
        click.echo('Resuming the previous program execution...')
        try:
            crawler.resume(crawler.get_resume_file())
        except IncompatibleVersionError as e:
            click.echo(e)
    crawler.add_init_urls(*urls)
    while True:
        choice = catch_keyboard_interrupt_choices(crawler.print_results, ['abort', 'continue', 'results'], 'a')\
            (set(exclude_flags), set(include_flags))
        if choice == 'a':
            crawler.close(True)
            click.echo('Created resume file "{}". Run again using the same parameters to resume.'.format(
                crawler.get_resume_file())
            )
            return
        elif choice == 'c':
            crawler.restart()
            continue
        else:
            break
    crawler.print_urls_info()
    if not sys.stdout.isatty():
        output_urls(crawler, stdout_flags)
    if to_file:
        crawler.create_report(to_file)
    if not to_file and os.path.exists(crawler.get_resume_file()):
        # The resume file exists. Deleting...
        os.remove(crawler.get_resume_file())
Example #10
def get_app_connection_name(config, app_name: str) -> str:
    """
    Get the connection name configured for an app, falling back to "default".

    :param config: Tortoise ORM config dict
    :param app_name: name of the app to look up under "apps"
    :return: the app's default_connection, or "default" if unset
    """
    app = config.get("apps").get(app_name)
    if app:
        return app.get("default_connection", "default")
    raise BadOptionUsage(
        option_name="--app",
        message=f'Can\'t get app named "{app_name}"',
    )
Example #11
    def handle_cli_args(
            self,
            url=None,
            platform=None,
            model_name=None,
            scenario_name=None,
            version=None,
            local_data=None,
            _store_as=("platform_info", "scenario_info"),
    ):
        """Handle command-line arguments.

        May update the :attr:`data_path`, :attr:`platform_info`, :attr:`scenario_info`,
        and/or :attr:`url` settings.
        """
        # Store the path to command-specific data and metadata
        if local_data:
            self.local_data = local_data

        # References to the Context settings to be updated
        platform_info = self.setdefault(_store_as[0], dict())
        scenario_info = self.setdefault(_store_as[1], dict())

        # Store information for the target Platform
        if url:
            if platform or model_name or scenario_name or version:
                raise BadOptionUsage(
                    "--platform --model --scenario and/or --version",
                    " redundant with --url",
                )

            self.url = url
            urlinfo = ixmp.utils.parse_url(url)
            platform_info.update(urlinfo[0])
            scenario_info.update(urlinfo[1])
        elif platform:
            platform_info["name"] = platform

        # Store information about the target Scenario
        if model_name:
            scenario_info["model"] = model_name
        if scenario_name:
            scenario_info["scenario"] = scenario_name
        if version:
            scenario_info["version"] = version
Example #12
    def __init__(self, settings: Settings):
        self.settings = settings
        self.mailbox = MailBox(self.settings.imap_hostname)
        self.mailbox.login(self.settings.imap_username,
                           self.settings.imap_password)

        self.spam_filters = [
            re.compile(spam_filter) for spam_filter in settings.spam_filters
        ]

        self.channels: typing.Dict[str, BaseChannel] = {}
        for channel_name in settings.channels:
            channel_class = self.CHANNELS.get(channel_name)
            if channel_class:
                self.channels[channel_name] = channel_class(settings)
            else:
                raise BadOptionUsage('channel',
                                     f'Channel {channel_name} is not defined')
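Examples #4 and #12 come from the same project; below is a hedged sketch of how a CHANNELS registry and a channel's check_settings might fit together. The class layout and attribute names are assumptions for illustration, not the actual imapmon code.

import typing

from click import BadOptionUsage


class BaseChannel:
    """Hypothetical base class; each channel validates its own settings."""

    def __init__(self, settings):
        self.settings = settings
        self.check_settings()

    def check_settings(self):
        pass


class TelegramChannel(BaseChannel):
    def check_settings(self):
        if not self.settings.telegram_bot_token:
            raise BadOptionUsage(
                'channel',
                'Telegram bot token is required for the Telegram channel')


# Registry consulted by the loop in Example #12; unknown channel names fall
# through to the BadOptionUsage('channel', ...) branch there.
CHANNELS: typing.Dict[str, typing.Type[BaseChannel]] = {
    'telegram': TelegramChannel,
}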
Example #13
def main(training_set, validation_set, ruleset, trainee, training_cache,
         validation_cache, delay, show_browser, stop_early, learning_rate,
         iterations, pos_weight, comment, quiet, confidence_threshold, layers,
         exclude):
    """Compute optimal numerical parameters for a Fathom ruleset.

    The usual invocation is something like this::

        fathom-train samples/training --validation-set samples/validation --ruleset rulesets.js --trainee new

    The first argument is a directory of labeled training pages. It can also
    be, for backward compatibility, a JSON file of vectors from FathomFox's
    Vectorizer.

    To see graphs of loss functions, install TensorBoard, then run
    ``tensorboard --logdir runs/``. These will tell you whether you need to
    adjust the ``--learning-rate``.

    Definitions of terms used in output:

    \b
    pruned
        Said of a node that was prematurely eliminated from consideration
        because it did not match the selector of any ``dom()`` call in the
        ruleset

    \b
    target
        A "right answer": a labeled, positive DOM node, one that should be
        recognized.

    """
    training_set = Path(training_set)

    # If they pass in a dir for either the training or validation sets, we need
    # a ruleset and a trainee for vectorizing:
    if (validation_set and validation_set.is_dir()) or training_set.is_dir():
        if not ruleset:
            raise BadOptionUsage(
                'ruleset',
                'A --ruleset file must be specified when TRAINING_SET_FOLDER or --validation-set are passed a directory.'
            )
        if not trainee:
            raise BadOptionUsage(
                'trainee',
                'A --trainee ID must be specified when TRAINING_SET_FOLDER or --validation-set are passed a directory.'
            )

    training_data = exclude_features(
        exclude,
        make_or_find_vectors(ruleset, trainee, training_set, training_cache,
                             show_browser, 'training', delay))
    training_pages = training_data['pages']
    x, y, num_yes, num_prunes = tensors_from(training_pages, shuffle=True)
    num_samples = len(x) + num_prunes

    if validation_set:
        validation_pages = exclude_features(
            exclude,
            make_or_find_vectors(ruleset, trainee, validation_set,
                                 validation_cache, show_browser, 'validation',
                                 delay))['pages']
        validation_ins, validation_outs, validation_yes, validation_prunes = tensors_from(
            validation_pages)
        validation_arg = validation_ins, validation_outs
    else:
        validation_arg = None

    layers = list(layers)  # Comes in as tuple
    full_comment = '.LR={l},i={i}{c}'.format(l=learning_rate,
                                             i=iterations,
                                             c=(',' +
                                                comment) if comment else '')
    model = learn(learning_rate,
                  iterations,
                  x,
                  y,
                  confidence_threshold,
                  num_prunes,
                  validation=validation_arg,
                  stop_early=stop_early,
                  run_comment=full_comment,
                  pos_weight=pos_weight,
                  layers=layers)

    print(pretty_coeffs(model, training_data['header']['featureNames']))
    accuracy, false_positives, false_negatives = accuracy_per_tag(
        y, model(x), confidence_threshold, num_prunes)
    print(
        pretty_accuracy('Training', accuracy, num_samples, false_positives,
                        false_negatives, num_yes + num_prunes))
    if validation_set:
        accuracy, false_positives, false_negatives = accuracy_per_tag(
            validation_outs, model(validation_ins), confidence_threshold,
            validation_prunes)
        print(
            pretty_accuracy('Validation', accuracy, len(validation_ins),
                            false_positives, false_negatives,
                            validation_yes + validation_prunes))

    # Print timing information:
    if training_pages and 'time' in training_pages[0]:
        if validation_set and validation_pages and 'time' in validation_pages[
                0]:
            print(speed_readout(training_pages + validation_pages))
        else:
            print(speed_readout(training_pages))

    if not quiet:
        print('\nTraining per-tag results:')
        print_per_tag_report([
            per_tag_metrics(page, model, confidence_threshold)
            for page in training_pages
        ])
        if validation_set:
            print('\nValidation per-tag results:')
            print_per_tag_report([
                per_tag_metrics(page, model, confidence_threshold)
                for page in validation_pages
            ])
Example #14
def option_error():
    raise BadOptionUsage("Incorrect option", "What a pity")
Example #15
def fetcher(config, show_list, fetchers, check, pool_size,
            https_only, http_check_https, no_socks, save):
    from .proxyfetcher import ProxyFetcher
    from .proxychecker import ProxyChecker

    if show_list:
        for fetcher in ProxyFetcher.registry.values():
            echo(fetcher.name + ' ' + fetcher.__module__ + ':' + fetcher.__name__)
        return

    proxies = OrderedDict()

    checker = None
    if check:
        conf = config.get('proxyfetcher', {})
        if http_check_https:
            conf['https_force_check'] = True
        if pool_size:
            conf['pool_size'] = pool_size
        blacklist = conf.pop('blacklist', None)
        if not blacklist:
            conf['blacklist'] = proxies
        else:
            # Do not check already checked proxies
            conf['blacklist'] = CompositeContains(blacklist, proxies)

        checker = ProxyChecker(**conf)

    json_encoder = JSONEncoder(**config.get('json', {}))

    def proxy(proxy):
        if proxy.addr in proxies:
            proxies[proxy.addr].merge_meta(proxy)
        else:
            proxies[proxy.addr] = proxy

    conf = config.get('proxyfetcher', {})
    fetchers_ = conf.pop('fetchers', None)
    if fetchers == '*':
        fetchers_ = ProxyFetcher.registry
    elif fetchers:
        fetchers_ = fetchers.split(',')
    if not fetchers_:
        raise BadOptionUsage('fetchers',
                             'You should specify fetchers with option or in config.')

    types = set(t.upper() for t in
                conf.pop('types', ['HTTP', 'HTTPS', 'SOCKS4', 'SOCKS5']))
    if https_only and not http_check_https:
        types = set(('HTTPS', 'SOCKS4', 'SOCKS5'))
    if no_socks:
        types = types.difference(['SOCKS4', 'SOCKS5'])
    if not types:
        raise BadOptionUsage('Proxy types appear to be empty. '
                             'Check config and options compatibility.')
    if pool_size:
        conf['pool_size'] = pool_size

    fetcher = ProxyFetcher(fetchers_, checker=checker, proxy=proxy, types=types, **conf)
    fetcher(join=True)

    http_count, socks_count, sources = 0, 0, {}
    for p in proxies.values():
        if tuple(p.types)[0].name.startswith('HTTP'):
            http_count += 1
        else:
            socks_count += 1
        for source in p.fetch_sources:
            sources.setdefault(source, {'total': 0, 'uniq': 0})
            sources[source]['total'] += 1
            if len(p.fetch_sources) == 1:
                sources[source]['uniq'] += 1
    sources = ', '.join(['{0}:total={1[total]} uniq={1[uniq]}'.format(k, v)
                         for k, v in sources.items()])
    logging.info('Fetched %s proxies (http(s)=%s, socks=%s %s)',
        len(proxies), http_count, socks_count, sources)

    json_encoder.dump(proxies.values(), save or sys.stdout)
Example #16
def main(training_set, validation_set, ruleset, trainee, training_cache, validation_cache, delay, show_browser, stop_early, learning_rate, iterations, pos_weight, comment, quiet, confidence_threshold, layers, exclude):
    """Compute optimal numerical parameters for a Fathom ruleset.

    There are a lot of options, but the usual invocation is something like...

      fathom-train samples/training --validation-set samples/validation --stop-early --ruleset rulesets.js --trainee new

    TRAINING_SET_FOLDER is a directory of labeled training pages. It can also
    be, for backward compatibility, a JSON file of vectors from FathomFox's
    Vectorizer.

    To see graphs of the results, install TensorBoard, then run this:
    tensorboard --logdir runs/. These will tell you whether you need to adjust
    the --learning-rate.

    Some vocab used in the output messages:

      target -- A "right answer" DOM node, one that should be recognized

      candidate -- Any node (target or not) brought into the ruleset, by a
      dom() call, for consideration

      negative sample -- A sample with no intended target nodes, used to bait
      the recognizer into a false-positive choice

    """
    training_set = Path(training_set)

    # If they pass in a dir for either the training or validation sets, we need
    # a ruleset and a trainee for vectorizing:
    if (validation_set and validation_set.is_dir()) or training_set.is_dir():
        if not ruleset:
            raise BadOptionUsage('ruleset', 'A --ruleset file must be specified when TRAINING_SET_FOLDER or --validation-set are passed a directory.')
        if not trainee:
            raise BadOptionUsage('trainee', 'A --trainee ID must be specified when TRAINING_SET_FOLDER or --validation-set are passed a directory.')

    with open(make_or_find_vectors(ruleset,
                                   trainee,
                                   training_set,
                                   training_cache,
                                   show_browser,
                                   'training',
                                   delay),
              encoding='utf-8') as training_file:
        training_data = exclude_features(exclude, load(training_file))
    training_pages = training_data['pages']
    x, y, num_yes = tensors_from(training_pages, shuffle=True)

    if validation_set:
        with open(make_or_find_vectors(ruleset,
                                       trainee,
                                       validation_set,
                                       validation_cache,
                                       show_browser,
                                       'validation',
                                       delay),
                  encoding='utf-8') as validation_file:
            validation_pages = exclude_features(exclude, load(validation_file))['pages']
        validation_ins, validation_outs, validation_yes = tensors_from(validation_pages)
        validation_arg = validation_ins, validation_outs
    else:
        validation_arg = None

    layers = list(layers)  # Comes in as tuple
    full_comment = '.LR={l},i={i}{c}'.format(
        l=learning_rate,
        i=iterations,
        c=(',' + comment) if comment else '')
    model = learn(learning_rate,
                  iterations,
                  x,
                  y,
                  confidence_threshold,
                  validation=validation_arg,
                  stop_early=stop_early,
                  run_comment=full_comment,
                  pos_weight=pos_weight,
                  layers=layers)

    print(pretty_coeffs(model, training_data['header']['featureNames']))
    accuracy, false_positives, false_negatives = accuracy_per_tag(y, model(x), confidence_threshold)
    print(pretty_accuracy('Training',
                          accuracy,
                          len(x),
                          false_positives,
                          false_negatives,
                          num_yes))
    if validation_set:
        accuracy, false_positives, false_negatives = accuracy_per_tag(validation_outs, model(validation_ins), confidence_threshold)
        print(pretty_accuracy('Validation',
                              accuracy,
                              len(validation_ins),
                              false_positives,
                              false_negatives,
                              validation_yes))

    # Print timing information:
    if training_pages and 'time' in training_pages[0]:
        if validation_set and validation_pages and 'time' in validation_pages[0]:
            print(speed_readout(training_pages + validation_pages))
        else:
            print(speed_readout(training_pages))

    if not quiet:
        print('\nTraining per-tag results:')
        print_per_tag_report([per_tag_metrics(page, model, confidence_threshold) for page in training_pages])
        if validation_set:
            print('\nValidation per-tag results:')
            print_per_tag_report([per_tag_metrics(page, model, confidence_threshold) for page in validation_pages])