def main():
    """
    Command-line entry point.

    Parses the command line with docopt (using the module docstring as the
    usage text) and runs the first selected sub-command:

    * ``add`` -- annotate the forms of the page given by ``<url>``;
    * ``check-data`` -- check the annotated data in the data folder;
    * ``train`` -- train an extractor on the data folder and save it
      to ``<modelfile>``;
    * ``run`` -- load an extractor from ``<modelfile>``, fetch ``<url>``
      and print each detected form with its type probabilities;
    * ``evaluate`` -- split the stored data into train/test parts and
      print evaluation metrics.

    When ``--data-folder`` is not given, the ``data`` directory located
    next to this file is used.
    """
    args = docopt.docopt(__doc__)

    data_folder = args['--data-folder']
    if data_folder is None:
        # Fall back to the 'data' directory that sits beside this module.
        data_folder = os.path.join(os.path.dirname(__file__), 'data')

    if args['add']:
        annotate_forms(data_folder=data_folder, url_argument=args["<url>"])
        return

    if args['check-data']:
        check_annotated_data(data_folder)
        return

    if args['train']:
        # train_ratio=1.0 is passed so that no data is held out here.
        extractor = FormExtractor.trained_on(data_folder=data_folder,
                                             train_ratio=1.0)
        extractor.save(args["<modelfile>"])
        return

    if args['run']:
        min_proba = float(args['--threshold'])
        print("Loading the extractor..")
        extractor = FormExtractor.load(args["<modelfile>"])
        print("Downloading data...")
        page_data, page_url = load_data(args["<url>"])
        doc = load_html(page_data, page_url)

        found = extractor.extract_forms_proba(doc, min_proba)
        if not found:
            print("No forms found.")
            return

        separator = "-"*40
        for form, type_probs in found:
            print(separator)
            print_form_html(form)
            print("")
            # Counter.most_common() yields the types ordered from the
            # highest probability to the lowest.
            ranked = Counter(type_probs).most_common()
            for type_code, proba in ranked:
                type_name = FORM_TYPES_INV[type_code]
                print("%s %0.1f%%" % (type_name, proba*100), end='    ')

            print("")
        return

    if not args['evaluate']:
        return

    n_folds = int(args["--cv"])
    test_ratio = float(args['--test-size'])

    store = Storage(data_folder)
    model = get_model()
    X, y = store.get_Xy(drop_duplicates=True, verbose=True, leave=True)

    # The last `test_ratio` share of the samples becomes the test part;
    # everything before the split point is used for training.
    split_at = len(y) - int(len(y) * test_ratio)
    X_train, X_test = X[:split_at], X[split_at:]
    y_train, y_test = y[:split_at], y[split_at:]

    evaluation.print_metrics(
        model, X, y, X_train, X_test, y_train, y_test,
        ipython=False, cv=n_folds, short_matrix=True,
    )
Example #2
0
def annotate_forms(data_folder, url_argument):
    """
    Interactively annotate the HTML forms of a single web page.

    The page referenced by ``url_argument`` is downloaded and parsed, all
    of its HTML forms are displayed and the user is asked for the type of
    each form. If any answers were given, the result is saved on disk in
    ``data_folder``: the web page is stored as a html file, and the URL
    together with the annotation results are added to index.json file.
    """
    store = Storage(data_folder)
    page_html, page_url = load_data(url_argument)
    tree = load_html(page_html, page_url)

    form_answers = _annotate_forms(store, tree)
    if not form_answers:
        # Nothing was annotated, so there is nothing to persist.
        return

    store.store_result(page_html, form_answers, page_url)
Example #3
0
def annotate_forms(data_folder, url_argument):
    """
    Run the interactive form annotation tool for one page.

    Steps:

    1. open the storage located at ``data_folder``;
    2. download the page given by ``url_argument`` and parse its HTML;
    3. display all HTML forms and ask the user about the type of each one;
    4. when there are answers, save the result on disk: the web page is
       stored as a html file and the URL and the annotation results are
       added to index.json file.
    """
    storage = Storage(data_folder)
    raw_html, final_url = load_data(url_argument)
    parsed_doc = load_html(raw_html, final_url)

    annotations = _annotate_forms(storage, parsed_doc)
    if not annotations:
        # No answers were collected; leave the storage untouched.
        return

    storage.store_result(raw_html, annotations, final_url)
Example #4
0
def main():
    """
    Run the command-line interface.

    The arguments are parsed by docopt from the module docstring. Exactly
    one of the sub-commands below is executed (the first one that is set):

    * ``add`` -- start the annotation tool for ``<url>``;
    * ``check-data`` -- check the annotated data;
    * ``train`` -- train a form extractor and save it to ``<modelfile>``;
    * ``run`` -- load the extractor from ``<modelfile>``, download
      ``<url>`` and print every extracted form followed by its form type
      probabilities;
    * ``evaluate`` -- print metrics for the model on the stored data.

    If ``--data-folder`` is omitted, the ``data`` folder relative to this
    file is used.
    """
    args = docopt.docopt(__doc__)

    folder = args['--data-folder']
    if folder is None:
        # by default, use 'data' folder relative to this file
        folder = os.path.join(os.path.dirname(__file__), 'data')

    if args['add']:
        annotate_forms(data_folder=folder, url_argument=args["<url>"])
        return

    if args['check-data']:
        check_annotated_data(folder)
        return

    if args['train']:
        trained = FormExtractor.trained_on(
            data_folder=folder,
            train_ratio=1.0,
        )
        trained.save(args["<modelfile>"])
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        loaded = FormExtractor.load(args["<modelfile>"])
        print("Downloading data...")
        body, url = load_data(args["<url>"])
        html_tree = load_html(body, url)

        forms_with_probs = loaded.extract_forms_proba(html_tree, threshold)
        if not forms_with_probs:
            print("No forms found.")
            return

        rule = "-" * 40
        for form_el, probs in forms_with_probs:
            print(rule)
            print_form_html(form_el)
            print("")
            # Most probable form types are printed first, all on one line.
            for short_type, p in Counter(probs).most_common():
                full_type = FORM_TYPES_INV[short_type]
                print("%s %0.1f%%" % (full_type, p * 100), end='    ')

            # Terminate the line of probabilities.
            print("")
        return

    if args['evaluate']:
        cv_folds = int(args["--cv"])
        test_share = float(args['--test-size'])

        storage = Storage(folder)
        model = get_model()
        X, y = storage.get_Xy(drop_duplicates=True, verbose=True, leave=True)

        # Head of the data is the training part, the tail is the test part.
        n_test = int(len(y) * test_share)
        n_train = len(y) - n_test
        X_train = X[:n_train]
        X_test = X[n_train:]
        y_train = y[:n_train]
        y_test = y[n_train:]

        evaluation.print_metrics(
            model, X, y,
            X_train, X_test, y_train, y_test,
            ipython=False,
            cv=cv_folds,
            short_matrix=True,
        )