def test_load_html():
    """load_html accepts bytes and text, and passes a parsed tree through."""
    markup = b"<div><b></b><b></b></div>"

    # Raw bytes are parsed into a tree with both <b> elements.
    from_bytes = load_html(markup)
    assert len(from_bytes.xpath('//b')) == 2

    # The decoded text gives the same element count.
    from_text = load_html(markup.decode('ascii'))
    assert len(from_text.xpath('//b')) == 2

    # An already-parsed tree comes back as the very same object.
    assert load_html(from_bytes) is from_bytes
def test_get_cleaned_form_html_human_readable():
    """
    The human-readable cleaned HTML contains no 'style', 'script' or 'div'
    substrings, and re-parsing it yields the same (name, value) field pairs.
    """
    def name_value_pairs(node):
        return [(field.name, field.value)
                for field in get_fields_to_annotate(node)]

    form = load_html(FORM1)
    cleaned = get_cleaned_form_html(form, human_readable=True)

    for unwanted in ('style', 'script', 'div'):
        assert unwanted not in cleaned

    before_cleaning = name_value_pairs(form)
    after_cleaning = name_value_pairs(load_html(cleaned))
    assert before_cleaning == after_cleaning
def test_add_text_before():
    """Text added before <br>, <p> and <i> shows up in the serialized tree."""
    tree = load_html("<div><p>hello<br/>world</p><i>X</i></div>")

    def prepend(query, text):
        # Look the element up right before inserting, one step at a time.
        add_text_before(tree.xpath(query)[0], text)

    prepend('//br', ",")
    prepend('//p', "!")
    prepend('//i', "1")

    serialized = html_tostring(tree).strip()
    assert serialized == "<div>!<p>hello,<br>world</p>1<i>X</i>\n</div>"
def check(self, verbose=True):
    """
    Check that items in storage are correct; print the problems found.

    For every entry of the index the following is verified:

    * the file exists under ``self.folder``;
    * the number of ``<form>`` elements in the file equals the number
      of entries in ``info["forms"]``;
    * ``info["visible_html_fields"]`` is present, has one item per form,
      and the keys of each item equal the names of the fields returned by
      ``get_fields_to_annotate`` for the corresponding form.

    When ``verbose`` is true the entries are iterated through a ``tqdm``
    progress bar.

    Return the number of errors found.
    """
    index = self.get_index()
    items = list(index.items())
    errors = 0
    if verbose:
        items = tqdm(items, "Checking", leave=True, mininterval=0,
                     ascii=True, ncols=80, unit=' files')

    for fn, info in items:
        fn_full = os.path.join(self.folder, fn)
        if not os.path.exists(fn_full):
            print("\nFile not found: %r" % fn_full)
            errors += 1
            continue

        with open(fn_full, 'rb') as f:
            data = f.read()

        doc = load_html(data, info['url'])
        # Evaluate the XPath query once per document; every check below
        # works on the same list of <form> elements.
        forms = doc.xpath("//form")

        if len(forms) != len(info["forms"]):
            errors += 1
            msg = "\nInvalid form count for entry %r: expected %d, got %d" % (
                fn, len(forms), len(info["forms"]))
            print(msg)

        if 'visible_html_fields' not in info:
            errors += 1
            print("No fields data for entry {!r}".format(fn))
            continue

        fields = info['visible_html_fields']
        if len(fields) != len(forms):
            errors += 1
            print(
                "Invalid number of form field annotations for entry {!r}"
                .format(fn))
            continue

        for idx, (form, fields_info) in enumerate(zip(forms, fields)):
            elems = get_fields_to_annotate(form)
            names = {elem.name for elem in elems}
            annotated_names = set(fields_info.keys())
            if names != annotated_names:
                errors += 1
                print("Invalid field names for form #{}, "
                      "entry {!r}. Expected: {}, found: {}".format(
                          idx, fn, names, annotated_names))

    if not errors:
        print("Status: OK")
    else:
        print("Status: %d error(s) found" % errors)
    return errors
def test_get_fields_to_annotate():
    """Fields picked from the first form of FORM1 are named as expected."""
    first_form = get_forms(load_html(FORM1))[0]
    elems = get_fields_to_annotate(first_form)

    # Every returned element must carry a truthy ``name``.
    for el in elems:
        assert getattr(el, 'name', None)

    names = get_field_names(elems)
    expected = ['foo', 'bar', 'ch', 'baz', 'go', 'cancel']
    assert names == expected
    assert {el.name for el in elems} == set(names)
def test_get_forms():
    """Both the lower-case and the upper-case form tags are returned."""
    markup = (
        ' <p>some text</p> <form action="/go">hi</form> '
        "<FORM method='post'><input name='foo'></FORM> "
    )
    forms = get_forms(load_html(markup))

    assert len(forms) == 2
    go_form, post_form = forms[0], forms[1]
    assert go_form.action == "/go"
    assert post_form.method == "POST"
def get_tree(self, path, info=None):
    """
    Parse one stored HTML file and return the resulting tree.

    ``path`` is the file path relative to ``self.folder`` (a key of the
    index); ``info`` is the annotation data for that key. When ``info``
    is omitted it is looked up in the index returned by
    ``self.get_index()``. The ``"url"`` entry of ``info`` is passed to
    ``load_html`` together with the file contents.
    """
    entry = self.get_index()[path] if info is None else info
    full_path = os.path.join(self.folder, path)
    with open(full_path, "rb") as fp:
        raw = fp.read()
        return load_html(raw, entry["url"])
def test_get_forms():
    """get_forms returns two forms with the expected action and method."""
    source = (
        ' <p>some text</p> <form action="/go">hi</form> '
        "<FORM method='post'><input name='foo'></FORM> "
    )
    tree = load_html(source)
    found = get_forms(tree)

    assert len(found) == 2
    assert found[0].action == "/go"
    assert found[1].method == "POST"
def check(self):
    """
    Check that items in storage are correct; print the problems found.

    For every entry of the index (iterated through a ``tqdm`` progress
    bar) the following is verified:

    * the file exists under ``self.folder``;
    * the number of ``<form>`` elements in the file equals the number
      of entries in ``info["forms"]``;
    * ``info["visible_html_fields"]`` is present, has one item per form,
      and the keys of each item equal the names of the fields returned by
      ``get_fields_to_annotate`` for the corresponding form.

    Return the number of errors found.
    """
    index = self.get_index()
    items = list(index.items())
    errors = 0

    for fn, info in tqdm(items, "Checking", leave=True, mininterval=0,
                         ascii=True, ncols=80, unit=' files'):
        fn_full = os.path.join(self.folder, fn)
        if not os.path.exists(fn_full):
            print("\nFile not found: %r" % fn_full)
            errors += 1
            continue

        with open(fn_full, 'rb') as f:
            data = f.read()

        doc = load_html(data, info['url'])
        # Evaluate the XPath query once per document; every check below
        # works on the same list of <form> elements.
        forms = doc.xpath("//form")

        if len(forms) != len(info["forms"]):
            errors += 1
            msg = "\nInvalid form count for entry %r: expected %d, got %d" % (
                fn, len(forms), len(info["forms"])
            )
            print(msg)

        if 'visible_html_fields' not in info:
            errors += 1
            print("No fields data for entry {!r}".format(fn))
            continue

        fields = info['visible_html_fields']
        if len(fields) != len(forms):
            errors += 1
            print("Invalid number of form field annotations for entry {!r}".format(fn))
            continue

        for idx, (form, fields_info) in enumerate(zip(forms, fields)):
            elems = get_fields_to_annotate(form)
            names = {elem.name for elem in elems}
            annotated_names = set(fields_info.keys())
            if names != annotated_names:
                errors += 1
                print("Invalid field names for form #{}, "
                      "entry {!r}. Expected: {}, found: {}".format(
                          idx, fn, names, annotated_names
                      ))

    if not errors:
        print("Status: OK")
    else:
        print("Status: %d error(s) found" % errors)
    return errors
def extract_forms(self, tree_or_html, proba=False, threshold=0.05):
    """
    Classify every form found in ``tree_or_html``.

    ``tree_or_html`` is an lxml tree or HTML source code. The result is a
    list of ``(form_elem, form_info)`` tuples, where ``form_info`` is the
    result of :meth:`classify_proba` (called with ``threshold``) when
    ``proba`` is true, and of :meth:`classify` otherwise.
    """
    forms = get_forms(load_html(tree_or_html))
    results = []
    if not proba:
        for form in forms:
            results.append((form, self.classify(form)))
        return results
    for form in forms:
        results.append((form, self.classify_proba(form, threshold)))
    return results
def add_result(self, html, url, form_answers=None,
               visible_html_fields=None, index=None, add_empty=True):
    """
    Store an HTML page together with its form and field annotations.

    The page is written to the file produced by
    ``self.generate_filename(url)`` and an entry with ``url``, ``forms``
    and ``visible_html_fields`` is put into the index, which is then
    written with ``self.write_index``. Missing ``form_answers`` and
    ``visible_html_fields`` are filled with the schemas' ``na_value``.

    With ``add_empty=False`` nothing is saved (and ``None`` is returned)
    when the page has no forms or none of its forms has fields to
    annotate. Otherwise the path of the saved file relative to
    ``self.folder`` is returned.
    """
    forms = get_forms(load_html(html))

    # An empty ``forms`` list also falls through to the early return here.
    if not add_empty and not any(
            len(get_fields_to_annotate(form)) != 0 for form in forms):
        return

    if form_answers is not None:
        assert len(form_answers) == len(forms)
    else:
        form_schema = self.get_form_schema()
        form_answers = [form_schema.na_value for _ in forms]

    if visible_html_fields is None:
        field_schema = self.get_field_schema()
        visible_html_fields = []
        for form in forms:
            field_names = get_field_names(get_fields_to_annotate(form))
            visible_html_fields.append(
                {name: field_schema.na_value for name in field_names})

    filename = self.generate_filename(url)
    path = os.path.relpath(filename, self.folder)

    if index is None:
        index = self.get_index()
    entry = {
        "url": url,
        "forms": form_answers,
        "visible_html_fields": visible_html_fields,
    }
    index[path] = entry

    with open(filename, 'wb') as out:
        # Text input is stored as UTF-8; bytes are written untouched.
        payload = html if isinstance(html, bytes) else html.encode('utf8')
        out.write(payload)

    self.write_index(index)
    return path
def extract_forms(self, tree_or_html, proba=False, threshold=0.05,
                  fields=True):
    """
    Classify every form found in ``tree_or_html``.

    ``tree_or_html`` is either HTML source (text or bytes), which is
    parsed with ``load_html``, or an already parsed tree, which is used
    as is. The result is a list of ``(form_elem, form_info)`` tuples,
    where ``form_info`` comes from :meth:`classify_proba` (called with
    ``threshold``) when ``proba`` is true, and from :meth:`classify`
    otherwise.

    ``fields`` is forwarded to the classification call; when it is False,
    field type information is not computed.
    """
    is_markup = isinstance(tree_or_html, (six.string_types, bytes))
    tree = load_html(tree_or_html) if is_markup else tree_or_html

    results = []
    if not proba:
        for form in get_forms(tree):
            results.append((form, self.classify(form, fields)))
        return results
    for form in get_forms(tree):
        results.append((form, self.classify_proba(form, threshold, fields)))
    return results
def test_get_text_around_elems():
    """
    get_text_around_elems returns a ``(before, after)`` pair of dicts keyed
    by field element, holding the text before and after each field.
    """
    tree = load_html(
        " <form> <h1>Login</h1> Please <b>enter</b> your details <p> "
        "Username: <input name='username'/> required "
        "<div>Email:</div> <input type='text' name='email'> * </p> "
        "Thanks! </form> "
    )
    elems = get_fields_to_annotate(tree)
    user, email = elems
    before, after = get_text_around_elems(tree, elems)

    assert len(before) == 2
    assert before[user] == 'Login Please enter your details Username:'
    # NOTE(review): this expectation was rebuilt from a garbled span; it
    # mirrors ``after[user]`` below (the text between the two fields) --
    # confirm against the upstream test.
    assert before[email] == 'required Email:'

    assert len(after) == 2
    assert after[user] == 'required Email:'
    assert after[email] == '* Thanks!'

    # The parentheses matter: ``x == {}, {}`` builds a tuple instead of
    # comparing against one, and without ``assert`` nothing is checked.
    # NOTE(review): assumes an empty ``elems`` yields two empty dicts --
    # confirm against get_text_around_elems.
    assert get_text_around_elems(tree, []) == ({}, {})
def main():
    """
    Command-line entry point.

    Arguments are parsed with docopt from the module docstring and exactly
    one of the commands is executed:

    * ``check-data`` -- check the storage, print form type counts and exit
      with status 1 when errors were found;
    * ``train`` -- train a classifier on the data folder and save it;
    * ``run`` -- download a page, extract its forms and print form and
      field type probabilities;
    * ``evaluate`` -- print classification reports for the form and/or
      field classifiers.
    """
    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    data_folder = args['--data-folder']
    if data_folder is None:
        data_folder = DEFAULT_DATA_PATH
    storage = Storage(data_folder)

    def show_probabilities(probabilities):
        # Most probable type first, all on the current output line.
        for type_name, prob in Counter(probabilities).most_common():
            print("%s %0.1f%%" % (type_name, prob * 100), end=' ')

    if args['check-data']:
        errors = storage.check()
        for simplify in (False, True):
            storage.print_form_type_counts(simplify=simplify)
        print("Errors:", errors)
        if errors:
            sys.exit(1)
        return

    if args['train']:
        ex = formasaurus.FormFieldClassifier.trained_on(data_folder)
        ex.save(args["<modelfile>"])
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        ex = formasaurus.FormFieldClassifier.load(args["<modelfile>"])
        print("Downloading data...")
        data = download(args["<url>"])
        tree = load_html(data, args['<url>'])

        result = ex.extract_forms(tree, proba=True, threshold=threshold)
        if not result:
            print("No forms found.")
            return

        for form, info in result:
            print("\n")
            print("=" * 60)
            print(get_cleaned_form_html(form))
            print("-" * 60)
            print("Form type: ", end="")
            show_probabilities(info['form'])

            print("\n\nField types:")
            for field_name, probs in info['fields'].items():
                print(field_name, end=': ')
                show_probabilities(probs)
                print("")
            print("")
        return

    if args['evaluate']:
        n_folds = int(args["--cv"])
        annotations = list(storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True,
        ))

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_folds=n_folds)
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_folds=n_folds)
def main():
    """
    Command-line entry point.

    Arguments are parsed with docopt from the module docstring and exactly
    one of the commands is executed:

    * ``check-data`` -- check the storage, print form type counts and exit
      with status 1 when errors were found;
    * ``train`` -- train a classifier on the data folder and save it;
    * ``init`` -- call ``FormFieldClassifier.load()`` without arguments;
    * ``run`` -- download a page, extract its forms and print form and
      field type probabilities;
    * ``evaluate`` -- print classification reports for the form and/or
      field classifiers.
    """
    args = docopt.docopt(__doc__, version=formasaurus.__version__)

    data_folder = args['--data-folder']
    if data_folder is None:
        data_folder = DEFAULT_DATA_PATH
    storage = Storage(data_folder)

    def print_ranked(probabilities):
        # Most probable type first, all on the current output line.
        for type_name, prob in Counter(probabilities).most_common():
            print("%s %0.1f%%" % (type_name, prob * 100), end=' ')

    if args['check-data']:
        errors = storage.check()
        storage.print_form_type_counts(simplify=False)
        storage.print_form_type_counts(simplify=True)
        print("Errors:", errors)
        if errors:
            sys.exit(1)
        return

    if args['train']:
        trained = formasaurus.FormFieldClassifier.trained_on(data_folder)
        trained.save(args["<modelfile>"])
        return

    if args['init']:
        formasaurus.FormFieldClassifier.load()
        return

    if args['run']:
        threshold = float(args['--threshold'])
        print("Loading the extractor..")
        extractor = formasaurus.FormFieldClassifier.load(args["<modelfile>"])
        print("Downloading {}...".format(args["<url>"]))
        data = download(args["<url>"])
        tree = load_html(data, args['<url>'])

        result = extractor.extract_forms(tree, proba=True,
                                         threshold=threshold)
        if not result:
            print("No forms found.")
            return

        for form, info in result:
            print("\n")
            print("=" * 60)
            print(get_cleaned_form_html(form))
            print("-" * 60)
            print("Form type: ", end="")
            print_ranked(info['form'])

            print("\n\nField types:")
            for field_name, probs in info['fields'].items():
                print(field_name, end=': ')
                print_ranked(probs)
                print("")
            print("")
        return

    if args['evaluate']:
        n_splits = int(args["--cv"])
        annotations = list(storage.iter_annotations(
            verbose=True,
            leave=True,
            simplify_form_types=True,
            simplify_field_types=True,
        ))

        if args['forms'] or args['all']:
            print("Evaluating form classifier...\n")
            formtype_model.print_classification_report(
                annotations, n_splits=n_splits)
            print("")

        if args['fields'] or args['all']:
            print("Evaluating form field classifier...\n")
            fieldtype_model.print_classification_report(
                annotations, n_splits=n_splits)
def test_add_text_before_root():
    """Text added before the <p> element precedes it in the output."""
    tree = load_html("<p>hello<br/>world</p>")
    paragraph = tree.xpath('//p')[0]
    add_text_before(paragraph, "!")

    rendered = html_tostring(tree).strip()
    assert rendered == "!<p>hello<br>world</p>"
def test_html_tostring(): src = "<form><input value='hello'><input type='submit'></form>" tree = load_html(src) assert html_tostring(tree) == """<form>
def test_add_text_after():
    """Text added after <br> and <p> shows up in the serialized tree."""
    tree = load_html("<p>hello,<br/>world</p>")

    # Each element is looked up right before its insertion.
    for query, text in (('//br', "brave new "), ('//p', "!")):
        add_text_after(tree.xpath(query)[0], text)

    rendered = html_tostring(tree).strip()
    assert rendered == "<p>hello,<br>brave new world</p>!"
def test_add_text_before():
    """Insertions before <br>, <p> and <i> all appear in the output."""
    tree = load_html("<div><p>hello<br/>world</p><i>X</i></div>")

    # Applied in this order; each element is looked up just before use.
    insertions = [('//br', ","), ('//p', "!"), ('//i', "1")]
    for query, text in insertions:
        target = tree.xpath(query)[0]
        add_text_before(target, text)

    expected = "<div>!<p>hello,<br>world</p>1<i>X</i>\n</div>"
    assert html_tostring(tree).strip() == expected