def on_submit(_):
    """Download the URL typed into ``url_field`` and try to store it.

    The stripped field value is downloaded and the HTML is passed to
    ``storage.add_result`` with ``add_empty=False``. When that call returns
    ``None`` a "No forms at" message is printed; otherwise the returned
    path and the URL are printed. The field is cleared afterwards.

    The single positional argument is ignored.
    """
    submitted_url = url_field.value.strip()
    page_html = download(submitted_url)
    saved_path = storage.add_result(page_html, submitted_url, add_empty=False)

    if saved_path is not None:
        print("Added:", saved_path, submitted_url)
    else:
        print("No forms at ", submitted_url)

    # Reset the input so the next URL can be entered.
    url_field.value = ""
def on_submit(_):
    """Download the URL typed into ``url_field`` and try to store it.

    The stripped field value is downloaded and the HTML is passed to
    ``storage.add_result`` with ``add_empty=False``. When that call returns
    ``None`` a "No forms at" message is printed; otherwise the returned
    path and the URL are printed. The field is cleared afterwards.

    The single positional argument is ignored.
    """
    url = url_field.value.strip()
    html = download(url)
    path = storage.add_result(html, url, add_empty=False)

    # Pass the pieces as separate arguments: wrapping them in an extra pair
    # of parentheses hands print() a single tuple, so the user would see the
    # tuple's repr (quotes, commas, brackets) instead of a readable message.
    if path is None:
        print("No forms at ", url)
    else:
        print("Added:", path, url)

    # Reset the input so the next URL can be entered.
    url_field.value = ""
def main():
    """Command-line entry point: parse arguments and run one command.

    Arguments are parsed by docopt from the module docstring. A ``Storage``
    is created for ``--data-folder`` (``DEFAULT_DATA_PATH`` when the option
    is ``None``), then the first matching command is executed:

    * ``check-data`` -- run ``storage.check()``, print form type counts
      (unsimplified, then simplified) and the errors; exit with status 1
      when errors are reported.
    * ``train`` -- build a ``FormFieldClassifier`` with ``trained_on`` the
      data folder and save it to ``<modelfile>``.
    * ``init`` -- call ``FormFieldClassifier.load()`` without arguments.
    * ``run`` -- load the model from ``<modelfile>``, download ``<url>``,
      extract forms and print form/field type probabilities for each.
    * ``evaluate`` -- print classification reports for the form and/or
      field type models, passing ``--cv`` as ``n_splits``.
    """
    def print_ranked(scores):
        # Print "<label> <percent>% " pairs on the current line, in the
        # order given by Counter.most_common().
        for label, score in Counter(scores).most_common():
            print("%s %0.1f%%" % (label, score * 100), end=' ')

    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    requested_folder = args['--data-folder']
    data_folder = (
        DEFAULT_DATA_PATH if requested_folder is None else requested_folder
    )
    storage = Storage(data_folder)

    if args['check-data']:
        errors = storage.check()
        for simplify in (False, True):
            storage.print_form_type_counts(simplify=simplify)
        print("Errors:", errors)
        if errors:
            sys.exit(1)
        return

    if args['train']:
        model = formasaurus.FormFieldClassifier.trained_on(data_folder)
        model.save(args["<modelfile>"])
        return

    if args['init']:
        formasaurus.FormFieldClassifier.load()
        return

    if args['run']:
        threshold = float(args['--threshold'])

        print("Loading the extractor..")
        model = formasaurus.FormFieldClassifier.load(args["<modelfile>"])

        url = args["<url>"]
        print("Downloading {}...".format(url))
        page = download(url)
        tree = load_html(page, url)

        found = model.extract_forms(tree, proba=True, threshold=threshold)
        if not found:
            print("No forms found.")
            return

        for form, info in found:
            print("\n")
            print("=" * 60)
            print(get_cleaned_form_html(form))
            print("-" * 60)

            print("Form type: ", end="")
            print_ranked(info['form'])

            print("\n\nField types:")
            for field_name, field_scores in info['fields'].items():
                print(field_name, end=': ')
                print_ranked(field_scores)
                print("")  # end the line for this field

            print("")
        return

    if args['evaluate']:
        n_splits = int(args["--cv"])
        annotations = list(storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True,
        ))

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_splits=n_splits)
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_splits=n_splits)
def main():
    """Parse the command line with docopt and execute the chosen command.

    A ``Storage`` is opened on ``--data-folder``, falling back to
    ``DEFAULT_DATA_PATH`` when the option is ``None``. Commands:

    * ``check-data`` -- run ``storage.check()``, print form type counts
      (unsimplified, then simplified) and the errors; exit with status 1
      when errors are reported.
    * ``train`` -- build a ``FormFieldClassifier`` with ``trained_on`` the
      data folder and save it to ``<modelfile>``.
    * ``run`` -- load the model from ``<modelfile>``, download ``<url>``,
      extract forms and print form/field type probabilities for each.
    * ``evaluate`` -- print classification reports for the form and/or
      field type models, passing ``--cv`` as ``n_folds``.
    """
    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    data_folder = args['--data-folder']
    if data_folder is None:
        data_folder = DEFAULT_DATA_PATH
    storage = Storage(data_folder)

    if args['check-data']:
        problems = storage.check()
        storage.print_form_type_counts(simplify=False)
        storage.print_form_type_counts(simplify=True)
        print("Errors:", problems)
        if problems:
            sys.exit(1)

    elif args['train']:
        classifier = formasaurus.FormFieldClassifier.trained_on(data_folder)
        classifier.save(args["<modelfile>"])

    elif args['run']:
        threshold = float(args['--threshold'])

        print("Loading the extractor..")
        classifier = formasaurus.FormFieldClassifier.load(args["<modelfile>"])

        print("Downloading data...")
        url = args["<url>"]
        tree = load_html(download(url), url)

        extracted = classifier.extract_forms(
            tree, proba=True, threshold=threshold)
        if not extracted:
            print("No forms found.")
            return

        heavy_rule = "=" * 60
        light_rule = "-" * 60
        for form, info in extracted:
            print("\n")
            print(heavy_rule)
            print(get_cleaned_form_html(form))
            print(light_rule)

            # Form type candidates, in Counter.most_common() order.
            print("Form type: ", end="")
            ranked_form_types = Counter(info['form']).most_common()
            for form_type, probability in ranked_form_types:
                print("%s %0.1f%%" % (form_type, probability * 100), end=' ')

            # One output line per field, listing its candidate types.
            print("\n\nField types:")
            for name, type_probs in info['fields'].items():
                print(name, end=': ')
                ranked_field_types = Counter(type_probs).most_common()
                for field_type, probability in ranked_field_types:
                    print("%s %0.1f%%" % (field_type, probability * 100),
                          end=' ')
                print("")

            print("")

    elif args['evaluate']:
        n_folds = int(args["--cv"])
        annotations = list(storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True,
        ))

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_folds=n_folds)
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_folds=n_folds)