def empty_storage(tmpdir):
    """
    Build a ``Storage`` located at ``str(tmpdir)`` and initialize it with
    a small hand-written configuration.

    The configuration declares four form types (search, login, other and
    "NOT ANNOTATED") where ``"l"`` is simplified to ``"o"``, and four field
    types (username, password, search query and "NOT ANNOTATED") with an
    empty simplify map.

    Returns the initialized storage object.
    """
    storage = Storage(str(tmpdir))

    # Form-level annotation schema.
    form_types = {
        "types": [
            {"short": "s", "full": "search"},
            {"short": "l", "full": "login"},
            {"short": "o", "full": "other"},
            {"short": "X", "full": "NOT ANNOTATED"},
        ],
        "simplify_map": {
            "l": "o",
        },
        "NA_value": "X",
        "skip_value": "-",
    }

    # Field-level annotation schema.
    field_types = {
        "types": [
            {"short": "us", "full": "username"},
            {"short": "p1", "full": "password"},
            {"short": "qq", "full": "search query"},
            {"short": "XX", "full": "NOT ANNOTATED"},
        ],
        "simplify_map": {},
        "NA_value": "XX",
        "skip_value": "--",
    }

    storage.initialize({
        "form_types": form_types,
        "field_types": field_types,
    })
    return storage
def main():
    """
    Command-line entry point.

    Parses the command line with docopt (the module docstring is the usage
    text) and runs the first matching sub-command, checked in this order:
    ``add``, ``check-data``, ``train``, ``run``, ``evaluate``.
    """
    args = docopt.docopt(__doc__)
    if args['--data-folder'] is None:
        # No folder supplied: fall back to the 'data' folder located next
        # to this file.
        args['--data-folder'] = os.path.join(os.path.dirname(__file__), 'data')

    if args['add']:
        annotate_forms(
            data_folder=args['--data-folder'],
            url_argument=args["<url>"],
        )
        return

    if args['check-data']:
        check_annotated_data(args['--data-folder'])
        return

    if args['train']:
        extractor = FormExtractor.trained_on(
            data_folder=args["--data-folder"],
            train_ratio=1.0,
        )
        extractor.save(args["<modelfile>"])
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        extractor = FormExtractor.load(args["<modelfile>"])
        print("Downloading data...")
        data, url = load_data(args["<url>"])
        tree = load_html(data, url)

        found = extractor.extract_forms_proba(tree, threshold)
        if not found:
            print("No forms found.")
            return

        for form, probs in found:
            print("-"*40)
            print_form_html(form)
            print("")
            # Most probable form type first.
            for short_type, probability in Counter(probs).most_common():
                full_type = FORM_TYPES_INV[short_type]
                print("%s %0.1f%%" % (full_type, probability*100), end=' ')
            print("")
        return

    if args['evaluate']:
        n_folds = int(args["--cv"])
        ratio = float(args['--test-size'])
        store = Storage(args["--data-folder"])
        model = get_model()
        X, y = store.get_Xy(drop_duplicates=True, verbose=True, leave=True)

        # The last ``test_size`` examples form the test part.
        test_size = int(len(y) * ratio)
        train_size = len(y) - test_size
        X_train, X_test = X[:train_size], X[train_size:]
        y_train, y_test = y[:train_size], y[train_size:]

        evaluation.print_metrics(
            model, X, y, X_train, X_test, y_train, y_test,
            ipython=False,
            cv=n_folds,
            short_matrix=True,
        )
def check_annotated_data(data_folder):
    """
    Validate the annotated data stored in ``data_folder``.

    Type counts are always printed; if the storage check reports a falsy
    result the process exits with status code 1.
    """
    store = Storage(data_folder)
    is_valid = store.check()
    store.print_type_counts()
    if is_valid:
        return
    sys.exit(1)
def check_annotated_data(data_folder):
    """
    Validate the annotated data stored in ``data_folder``.

    Type counts and the value returned by the storage check are always
    printed; if that value is truthy the process exits with status code 1.
    """
    store = Storage(data_folder)
    found_errors = store.check()
    store.print_type_counts()
    print("Errors:", found_errors)
    if not found_errors:
        return
    sys.exit(1)
def train(self, data_folder, train_ratio=1.0):
    """
    Fit a fresh model on data loaded from ``data_folder``.

    Only the first ``int(len(y) * train_ratio)`` examples are used for
    fitting. The fitted model is stored as ``self.model``.
    """
    all_X, all_y = Storage(data_folder).get_Xy(
        drop_duplicates=True,
        verbose=True,
        leave=True,
    )

    # Keep the leading part of the data set only.
    n_train = int(len(all_y) * train_ratio)
    train_X = all_X[:n_train]
    train_y = all_y[:n_train]

    fitted = get_model()
    print("Training on %d example(s)..." % len(train_y))
    fitted.fit(train_X, train_y)
    # Assigned only after a successful fit.
    self.model = fitted
def trained_on(cls, data_folder):
    """
    Create a ``cls`` instance and train it on annotations read from
    ``data_folder``.

    Annotations are loaded with both form types and field types
    simplified. Returns the trained instance.
    """
    source = Storage(data_folder)
    print("Loading training data...")
    annotation_iter = source.iter_annotations(
        simplify_form_types=True,
        simplify_field_types=True,
        verbose=True,
        leave=True,
    )
    # Materialize everything before the instance is created and trained.
    training_data = list(annotation_iter)

    instance = cls()
    instance.train(training_data)
    return instance
def annotate_forms(data_folder, url_argument):
    """
    Run the interactive HTML form annotation tool.

    The page referenced by ``url_argument`` is loaded and parsed, its
    forms are shown to the user and the user is asked for the type of
    each form. When there are answers, the page HTML, the answers and
    the URL are handed to the storage in ``data_folder`` to be saved;
    otherwise nothing is stored.
    """
    store = Storage(data_folder)
    page_html, page_url = load_data(url_argument)
    document = load_html(page_html, page_url)

    answers = _annotate_forms(store, document)
    if not answers:
        # Nothing was annotated, so there is nothing to persist.
        return
    store.store_result(page_html, answers, page_url)
def main():
    """
    Command-line entry point.

    Parses the command line with docopt (the module docstring is the usage
    text) and runs the first matching sub-command, checked in this order:
    ``add``, ``check-data``, ``train``, ``run``, ``evaluate``.
    """
    args = docopt.docopt(__doc__)
    if args['--data-folder'] is None:
        # Default location: the 'data' folder that sits beside this file.
        args['--data-folder'] = os.path.join(os.path.dirname(__file__), 'data')

    if args['add']:
        annotate_forms(
            data_folder=args['--data-folder'],
            url_argument=args["<url>"],
        )
        return

    if args['check-data']:
        check_annotated_data(args['--data-folder'])
        return

    if args['train']:
        trained = FormExtractor.trained_on(
            data_folder=args["--data-folder"],
            train_ratio=1.0,
        )
        trained.save(args["<modelfile>"])
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        loaded = FormExtractor.load(args["<modelfile>"])
        print("Downloading data...")
        data, url = load_data(args["<url>"])
        tree = load_html(data, url)

        forms_with_probs = loaded.extract_forms_proba(tree, threshold)
        if not forms_with_probs:
            print("No forms found.")
            return

        for form, probs in forms_with_probs:
            print("-" * 40)
            print_form_html(form)
            print("")
            # Form types are listed from most to least probable.
            for type_code, type_prob in Counter(probs).most_common():
                type_name = FORM_TYPES_INV[type_code]
                print("%s %0.1f%%" % (type_name, type_prob * 100), end=' ')
            print("")
        return

    if args['evaluate']:
        n_folds = int(args["--cv"])
        ratio = float(args['--test-size'])
        store = Storage(args["--data-folder"])
        model = get_model()
        X, y = store.get_Xy(drop_duplicates=True, verbose=True, leave=True)

        # Hold out the trailing ``test_size`` examples for testing.
        test_size = int(len(y) * ratio)
        train_size = len(y) - test_size
        X_train, X_test = X[:train_size], X[train_size:]
        y_train, y_test = y[:train_size], y[train_size:]

        evaluation.print_metrics(
            model, X, y, X_train, X_test, y_train, y_test,
            ipython=False,
            cv=n_folds,
            short_matrix=True,
        )
def main():
    """
    Command-line entry point.

    Parses the command line with docopt (the module docstring is the usage
    text) and runs the first matching sub-command, checked in this order:
    ``check-data``, ``train``, ``init``, ``run``, ``evaluate``. When
    ``--data-folder`` is not given, ``DEFAULT_DATA_PATH`` is used.
    """
    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    data_folder = args['--data-folder']
    if data_folder is None:
        data_folder = DEFAULT_DATA_PATH
    storage = Storage(data_folder)

    if args['check-data']:
        errors = storage.check()
        storage.print_form_type_counts(simplify=False)
        storage.print_form_type_counts(simplify=True)
        print("Errors:", errors)
        if errors:
            sys.exit(1)
        return

    if args['train']:
        classifier = formasaurus.FormFieldClassifier.trained_on(data_folder)
        classifier.save(args["<modelfile>"])
        return

    if args['init']:
        formasaurus.FormFieldClassifier.load()
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        classifier = formasaurus.FormFieldClassifier.load(args["<modelfile>"])
        print("Downloading {}...".format(args["<url>"]))
        page = download(args["<url>"])
        tree = load_html(page, args['<url>'])

        extracted = classifier.extract_forms(
            tree, proba=True, threshold=threshold
        )
        if not extracted:
            print("No forms found.")
            return

        for form, info in extracted:
            print("\n")
            print("=" * 60)
            print(get_cleaned_form_html(form))
            print("-" * 60)

            # Form type candidates, most probable first.
            print("Form type: ", end="")
            for form_type, form_prob in Counter(info['form']).most_common():
                print("%s %0.1f%%" % (form_type, form_prob * 100), end=' ')

            # One output line per field, candidates most probable first.
            print("\n\nField types:")
            for field_name, field_probs in info['fields'].items():
                print(field_name, end=': ')
                for field_type, field_prob in Counter(field_probs).most_common():
                    print("%s %0.1f%%" % (field_type, field_prob * 100), end=' ')
                print("")
            print("")
        return

    if args['evaluate']:
        n_splits = int(args["--cv"])
        annotation_iter = storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True,
        )
        annotations = list(annotation_iter)

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_splits=n_splits
            )
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_splits=n_splits
            )
def storage():
    """Return a ``Storage`` opened on ``DEFAULT_DATA_PATH``."""
    default_storage = Storage(DEFAULT_DATA_PATH)
    return default_storage
def main():
    """
    Command-line entry point.

    Parses the command line with docopt (the module docstring is the usage
    text) and runs the first matching sub-command, checked in this order:
    ``check-data``, ``train``, ``run``, ``evaluate``. When
    ``--data-folder`` is not given, ``DEFAULT_DATA_PATH`` is used.
    """
    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    data_folder = args['--data-folder']
    if data_folder is None:
        data_folder = DEFAULT_DATA_PATH
    storage = Storage(data_folder)

    if args['check-data']:
        errors = storage.check()
        storage.print_form_type_counts(simplify=False)
        storage.print_form_type_counts(simplify=True)
        print("Errors:", errors)
        if errors:
            sys.exit(1)
        return

    if args['train']:
        classifier = formasaurus.FormFieldClassifier.trained_on(data_folder)
        classifier.save(args["<modelfile>"])
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        classifier = formasaurus.FormFieldClassifier.load(args["<modelfile>"])
        print("Downloading data...")
        page = download(args["<url>"])
        tree = load_html(page, args['<url>'])

        extracted = classifier.extract_forms(
            tree, proba=True, threshold=threshold
        )
        if not extracted:
            print("No forms found.")
            return

        for form, info in extracted:
            print("\n")
            print("="*60)
            print(get_cleaned_form_html(form))
            print("-"*60)

            # Form type candidates, most probable first.
            print("Form type: ", end="")
            for form_type, form_prob in Counter(info['form']).most_common():
                print("%s %0.1f%%" % (form_type, form_prob * 100), end=' ')

            # One output line per field, candidates most probable first.
            print("\n\nField types:")
            for field_name, field_probs in info['fields'].items():
                print(field_name, end=': ')
                for field_type, field_prob in Counter(field_probs).most_common():
                    print("%s %0.1f%%" % (field_type, field_prob * 100), end=' ')
                print("")
            print("")
        return

    if args['evaluate']:
        n_folds = int(args["--cv"])
        annotation_iter = storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True,
        )
        annotations = list(annotation_iter)

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_folds=n_folds
            )
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_folds=n_folds
            )