def main():
    """
    Command-line entry point.

    Parses the command line with docopt (the module docstring is the usage
    specification) and runs whichever sub-command was selected: ``add``,
    ``check-data``, ``train``, ``run`` or ``evaluate``.  Nothing is done
    when none of these flags is set.
    """
    args = docopt.docopt(__doc__)

    data_folder = args['--data-folder']
    if data_folder is None:
        # no folder given: fall back to the 'data' directory next to this file
        data_folder = os.path.join(os.path.dirname(__file__), 'data')

    if args['add']:
        annotate_forms(data_folder=data_folder, url_argument=args["<url>"])
        return

    if args['check-data']:
        check_annotated_data(data_folder)
        return

    if args['train']:
        # train on all of the annotated data, then persist the model
        extractor = FormExtractor.trained_on(
            data_folder=data_folder,
            train_ratio=1.0,
        )
        extractor.save(args["<modelfile>"])
        return

    if args['run']:
        # convert the threshold before any output so a bad value fails early
        min_proba = float(args['--threshold'])

        print("Loading the extractor..")
        extractor = FormExtractor.load(args["<modelfile>"])

        print("Downloading data...")
        page, page_url = load_data(args["<url>"])
        doc = load_html(page, page_url)

        found = extractor.extract_forms_proba(doc, min_proba)
        if not found:
            print("No forms found.")
            return

        for form_el, type_probs in found:
            print("-" * 40)
            print_form_html(form_el)
            print("")
            # Counter.most_common() yields the types by decreasing probability
            for short_name, p in Counter(type_probs).most_common():
                print("%s %0.1f%%" % (FORM_TYPES_INV[short_name], p * 100), end=' ')
            print("")
        return

    if args['evaluate']:
        cv_folds = int(args["--cv"])
        test_fraction = float(args['--test-size'])

        storage = Storage(data_folder)
        clf = get_model()
        X, y = storage.get_Xy(drop_duplicates=True, verbose=True, leave=True)

        # the hold-out set is the tail of the data; everything before it
        # is used for training
        n_test = int(len(y) * test_fraction)
        n_train = len(y) - n_test
        X_train = X[:n_train]
        X_test = X[n_train:]
        y_train = y[:n_train]
        y_test = y[n_train:]

        evaluation.print_metrics(
            clf, X, y, X_train, X_test, y_train, y_test,
            ipython=False,
            cv=cv_folds,
            short_matrix=True,
        )
def annotate_forms(data_folder, url_argument):
    """
    Interactively annotate the HTML forms of a single web page.

    The page referenced by ``url_argument`` is loaded and parsed, and
    the user is asked for the type of each form found on it.  When at
    least one answer was given, the page HTML, the answers and the URL
    are passed to the storage backed by ``data_folder`` (the page is kept
    as a html file; URL and annotations go to index.json).
    """
    storage = Storage(data_folder)
    page_html, page_url = load_data(url_argument)
    annotations = _annotate_forms(storage, load_html(page_html, page_url))

    # nothing was annotated, so there is nothing to save
    if not annotations:
        return

    storage.store_result(page_html, annotations, page_url)
def main():
    """
    Run the command-line interface.

    The options are parsed by docopt from the module docstring; exactly
    one of the ``add`` / ``check-data`` / ``train`` / ``run`` /
    ``evaluate`` branches below is then executed.
    """
    args = docopt.docopt(__doc__)

    if args['--data-folder'] is None:
        # default location: a 'data' folder that lives beside this module
        module_dir = os.path.dirname(__file__)
        args['--data-folder'] = os.path.join(module_dir, 'data')

    if args['add']:
        annotate_forms(
            data_folder=args['--data-folder'],
            url_argument=args["<url>"],
        )

    elif args['check-data']:
        check_annotated_data(args['--data-folder'])

    elif args['train']:
        # fit on the whole data set (train_ratio=1.0) and save the result
        FormExtractor.trained_on(
            data_folder=args["--data-folder"],
            train_ratio=1.0,
        ).save(args["<modelfile>"])

    elif args['run']:
        threshold = float(args['--threshold'])

        print("Loading the extractor..")
        extractor = FormExtractor.load(args["<modelfile>"])

        print("Downloading data...")
        html, url = load_data(args["<url>"])
        forms_with_probs = extractor.extract_forms_proba(
            load_html(html, url), threshold
        )

        if not forms_with_probs:
            print("No forms found.")
            return

        separator = "-" * 40
        for form, probs in forms_with_probs:
            print(separator)
            print_form_html(form)
            print("")
            # list the form types, most probable first, on a single line
            ranked = Counter(probs).most_common()
            for form_type, probability in ranked:
                full_name = FORM_TYPES_INV[form_type]
                percent = probability * 100
                print("%s %0.1f%%" % (full_name, percent), end=' ')
            print("")

    elif args['evaluate']:
        folds = int(args["--cv"])
        test_share = float(args['--test-size'])

        store = Storage(args["--data-folder"])
        model = get_model()
        X, y = store.get_Xy(drop_duplicates=True, verbose=True, leave=True)

        # split point: the last int(len(y) * test_share) items are the test set
        total = len(y)
        split_at = total - int(total * test_share)
        X_train, X_test = X[:split_at], X[split_at:]
        y_train, y_test = y[:split_at], y[split_at:]

        evaluation.print_metrics(model, X, y,
                                 X_train, X_test, y_train, y_test,
                                 ipython=False, cv=folds, short_matrix=True)