コード例 #1
0
def test_get_cleaned_form_html_human_readable():
    """
    Human-readable cleaned HTML must not contain the substrings
    'style', 'script' or 'div', and re-parsing it must yield the same
    (name, value) pairs for the fields to annotate as the original form.
    """
    def name_value_pairs(root):
        # (name, value) of every field selected for annotation, in order.
        return [(field.name, field.value)
                for field in get_fields_to_annotate(root)]

    source_form = load_html(FORM1)
    cleaned = get_cleaned_form_html(source_form, human_readable=True)
    for unwanted in ('style', 'script', 'div'):
        assert unwanted not in cleaned

    assert name_value_pairs(source_form) == name_value_pairs(load_html(cleaned))
コード例 #2
0
def test_get_cleaned_form_html_human_readable():
    """
    Check ``get_cleaned_form_html(..., human_readable=True)``: the output
    has no 'style', 'script' or 'div' substrings, and the fields to
    annotate keep their names and values after the round trip.
    """
    original = load_html(FORM1)
    cleaned_html = get_cleaned_form_html(original, human_readable=True)
    assert not any(
        token in cleaned_html for token in ('style', 'script', 'div'))

    # Compare field (name, value) pairs before and after cleaning.
    expected = []
    for field in get_fields_to_annotate(original):
        expected.append((field.name, field.value))
    actual = []
    for field in get_fields_to_annotate(load_html(cleaned_html)):
        actual.append((field.name, field.value))
    assert expected == actual
コード例 #3
0
ファイル: classifiers.py プロジェクト: charx0r/formasaurus
    def classify_proba(self, form, threshold=0.0):
        """
        Return dict with probabilities of ``form`` and its fields belonging
        to various form and field classes::

            {
                'form': {'type1': prob1, 'type2': prob2, ...},
                'fields': {
                    'name': {'type1': prob1, 'type2': prob2, ...},
                    ...
                }
            }

        ``form`` should be an lxml HTML <form> element.
        Only classes with probability >= ``threshold`` are preserved.

        The most probable form type is used as a feature for the field
        model. If ``threshold`` removes every form class, the form type is
        picked from the unthresholded probabilities instead, while the
        returned ``'form'`` dict stays thresholded (and therefore empty).
        """
        form_types_proba = self.form_classifier.classify_proba(form, threshold)

        # max() raises ValueError on an empty mapping, which happens when no
        # form class reaches ``threshold``; rank unfiltered scores then.
        ranking = form_types_proba
        if not ranking:
            ranking = self.form_classifier.classify_proba(form, 0.0)
        form_type = max(ranking, key=ranking.get)

        field_elems = get_fields_to_annotate(form)
        xseq = fieldtype_model.get_form_features(form, form_type, field_elems)
        yseq = self._field_model.predict_marginals_single(xseq)

        return {
            'form': form_types_proba,
            'fields': {
                elem.name: thresholded(probs, threshold)
                for elem, probs in zip(field_elems, yseq)
            },
        }
コード例 #4
0
def get_form_features(form, form_type, field_elems=None):
    """
    Build one feature dict per field element of a <form> element.

    ``field_elems`` defaults to ``get_fields_to_annotate(form)`` (the
    visible submittable fields). Each dict starts from
    ``_elem_features(elem)`` and additionally gets position flags, the
    form type, n-grams of the text around the element and a bias term.
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)
    text_before, text_after = get_text_around_elems(form, field_elems)
    features = [_elem_features(elem) for elem in field_elems]
    last_idx = len(features) - 1

    for idx, (elem, feat) in enumerate(zip(field_elems, features)):
        if idx == 0:
            feat['is-first'] = True
        if idx == last_idx:
            feat['is-last'] = True

        feat['form-type'] = form_type

        # uni- and bigrams of the last 6 tokens preceding the element
        tokens_before = tokenize(normalize(text_before[elem]))[-6:]
        feat['text-before'] = token_ngrams(tokens_before, 1, 2)

        # uni- and bigrams of the first 5 tokens following the element
        tokens_after = tokenize(normalize(text_after[elem]))[:5]
        feat['text-after'] = token_ngrams(tokens_after, 1, 2)
        feat['bias'] = 1

    return features
コード例 #5
0
    def check(self, verbose=True):
        """
        Validate every index entry against the HTML file it refers to,
        printing each problem found, and return the number of errors.

        For each entry the following is verified: the file exists; the
        number of <form> elements in the document equals the number of
        stored form annotations; field annotations are present, there is
        one per form, and their keys equal the names of the form's fields
        to annotate. With ``verbose`` a tqdm progress bar is shown.
        """
        entries = list(self.get_index().items())
        errors = 0
        if verbose:
            entries = tqdm(entries, "Checking", leave=True, mininterval=0,
                           ascii=True, ncols=80, unit=' files')

        for fn, info in entries:
            fn_full = os.path.join(self.folder, fn)
            if not os.path.exists(fn_full):
                print("\nFile not found: %r" % fn_full)
                errors += 1
                continue

            with open(fn_full, 'rb') as f:
                data = f.read()

            doc = load_html(data, info['url'])
            doc_forms = doc.xpath("//form")
            n_doc_forms = len(doc_forms)
            n_annotated = len(info["forms"])
            if n_doc_forms != n_annotated:
                errors += 1
                print("\nInvalid form count for entry %r: expected %d, got %d"
                      % (fn, n_doc_forms, n_annotated))

            if 'visible_html_fields' not in info:
                errors += 1
                print("No fields data for entry {!r}".format(fn))
                continue

            fields = info['visible_html_fields']
            if len(fields) != n_doc_forms:
                errors += 1
                print(
                    "Invalid number of form field annotations for entry {!r}"
                    .format(fn))
                continue

            for idx, (form, fields_info) in enumerate(zip(doc_forms, fields)):
                # names found in the document vs. names stored in the index
                expected = {elem.name
                            for elem in get_fields_to_annotate(form)}
                found = set(fields_info.keys())
                if expected != found:
                    errors += 1
                    print("Invalid field names for form #{}, "
                          "entry {!r}. Expected: {}, found: {}".format(
                              idx, fn, expected, found))

        if errors:
            print("Status: %d error(s) found" % errors)
        else:
            print("Status: OK")

        return errors
コード例 #6
0
def get_form_features(form, form_type, field_elems=None):
    """
    Return a list with one feature dict for every field element of a
    <form> element (by default, the elements returned by
    ``get_fields_to_annotate(form)``, i.e. visible submittable fields).
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)
    text_before, text_after = get_text_around_elems(form, field_elems)
    feature_dicts = [_elem_features(elem) for elem in field_elems]
    total = len(feature_dicts)

    for position, feature_dict in enumerate(feature_dicts):
        elem = field_elems[position]

        # mark the boundaries of the field sequence
        if position == 0:
            feature_dict['is-first'] = True
        if position == total - 1:
            feature_dict['is-last'] = True

        feature_dict['form-type'] = form_type

        # text preceding the element: keep the 6 closest tokens
        preceding = tokenize(normalize(text_before[elem]))
        feature_dict['text-before'] = token_ngrams(preceding[-6:], 1, 2)

        # text following the element: keep the 5 closest tokens
        following = tokenize(normalize(text_after[elem]))
        feature_dict['text-after'] = token_ngrams(following[:5], 1, 2)
        feature_dict['bias'] = 1

    return feature_dicts
コード例 #7
0
ファイル: annotation.py プロジェクト: charx0r/formasaurus
 def field_elems(self):
     """
     List the lxml Elements of the fields which are annotated.

     Elements come in the order they appear in the form; only visible
     submittable fields are considered.
     """
     form = self.form
     return get_fields_to_annotate(form)
コード例 #8
0
 def field_elems(self):
     """
     Return the annotated fields of ``self.form`` as a list of lxml
     Elements, in the order they appear in the form. Only visible
     submittable fields are considered.
     """
     annotated = get_fields_to_annotate(self.form)
     return annotated
コード例 #9
0
def test_get_fields_to_annotate():
    """
    Every element selected for annotation in the first form of FORM1 has
    a non-empty name, and the names come in the expected order.
    """
    first_form = get_forms(load_html(FORM1))[0]
    elems = get_fields_to_annotate(first_form)
    for el in elems:
        assert getattr(el, 'name', None)

    names = get_field_names(elems)
    assert names == ['foo', 'bar', 'ch', 'baz', 'go', 'cancel']
    # get_field_names must report exactly the names of the elements
    assert {el.name for el in elems} == set(names)
コード例 #10
0
ファイル: storage.py プロジェクト: uzbekdev1/Formasaurus
    def add_result(self, html, url, form_answers=None,
                   visible_html_fields=None, index=None, add_empty=True):
        """
        Save HTML source and its <form> and form field types.

        Missing ``form_answers`` / ``visible_html_fields`` are filled with
        the ``na_value`` of the form / field schema. When ``add_empty`` is
        False, pages without forms, or whose forms have no fields to
        annotate, are skipped and None is returned. Otherwise the HTML is
        written to a generated file, the index is updated and written, and
        the file path relative to the storage folder is returned.
        """
        forms = get_forms(load_html(html))
        if not add_empty:
            if len(forms) == 0 or all(
                    len(get_fields_to_annotate(form)) == 0 for form in forms):
                return

        if form_answers is not None:
            assert len(form_answers) == len(forms)
        else:
            form_schema = self.get_form_schema()
            form_answers = [form_schema.na_value for _ in forms]

        if visible_html_fields is None:
            field_schema = self.get_field_schema()
            visible_html_fields = []
            for form in forms:
                field_names = get_field_names(get_fields_to_annotate(form))
                visible_html_fields.append(
                    {name: field_schema.na_value for name in field_names})

        filename = self.generate_filename(url)
        path = os.path.relpath(filename, self.folder)
        if index is None:
            index = self.get_index()
        entry = {
            "url": url,
            "forms": form_answers,
            "visible_html_fields": visible_html_fields,
        }
        index[path] = entry

        with open(filename, 'wb') as f:
            # the file is binary: text input is stored as UTF-8
            if not isinstance(html, bytes):
                html = html.encode('utf8')
            f.write(html)
        self.write_index(index)
        return path
コード例 #11
0
def test_get_fields_to_annotate():
    """
    ``get_fields_to_annotate`` on the first FORM1 form returns only named
    elements, and ``get_field_names`` lists their names in order.
    """
    document = load_html(FORM1)
    forms = get_forms(document)
    elems = get_fields_to_annotate(forms[0])
    assert not any(not getattr(el, 'name', None) for el in elems)

    expected_names = ['foo', 'bar', 'ch', 'baz', 'go', 'cancel']
    names = get_field_names(elems)
    assert names == expected_names
    element_names = {el.name for el in elems}
    assert set(names) == element_names
コード例 #12
0
ファイル: storage.py プロジェクト: JonathanBowker/Formasaurus
    def add_result(self, html, url, form_answers=None,
                   visible_html_fields=None, index=None,
                   add_empty=True):
        """
        Store an HTML page together with the types of its <form> elements
        and of their fields.

        ``form_answers`` and ``visible_html_fields`` default to the
        schemas' ``na_value`` for every form / field name. With
        ``add_empty=False`` nothing is stored (None is returned) when the
        page has no forms or no form has fields to annotate. Returns the
        path of the stored file relative to the storage folder.
        """
        forms = get_forms(load_html(html))

        if not add_empty:
            if not len(forms):
                return
            has_fields = any(
                len(get_fields_to_annotate(form)) != 0 for form in forms)
            if not has_fields:
                return

        if form_answers is None:
            form_schema = self.get_form_schema()
            form_answers = [form_schema.na_value for _form in forms]
        else:
            assert len(form_answers) == len(forms)

        if visible_html_fields is None:
            field_schema = self.get_field_schema()

            def _unannotated(form):
                # every field name of the form mapped to the "N/A" type
                names = get_field_names(get_fields_to_annotate(form))
                return {name: field_schema.na_value for name in names}

            visible_html_fields = [_unannotated(form) for form in forms]

        filename = self.generate_filename(url)
        path = os.path.relpath(filename, self.folder)
        if index is None:
            index = self.get_index()
        index[path] = {
            "url": url,
            "forms": form_answers,
            "visible_html_fields": visible_html_fields,
        }

        with open(filename, 'wb') as f:
            if not isinstance(html, bytes):
                html = html.encode('utf8')
            f.write(html)

        self.write_index(index)
        return path
コード例 #13
0
ファイル: widgets.py プロジェクト: Python3pkg/Formasaurus
def FormAnnotator(ann,
                  annotate_fields=True,
                  annotate_types=True,
                  max_fields=80):
    """
    Widget for annotating a single HTML form.

    The result is a ``widgets.VBox`` holding, in order: a form type
    selector (if ``annotate_types``), an HTML header with the form's type,
    URL, key and index, and then either one tab per field (if
    ``annotate_fields``) or a plain view of the form. When the form has
    more than ``max_fields`` field names, a "Too many fields" notice is
    shown instead of the tabs.
    """
    assert annotate_fields or annotate_types
    form_types_inv = ann.form_schema.types_inv

    children = []
    if annotate_types:
        children.append(FormTypeSelect(ann))

    header_tpl = """
    <h4>
        {tp} <a href='{url}'>{url}</a>
        <small>{key} #{index}</small>
    </h4>
    """
    header_html = header_tpl.format(url=ann.url,
                                    index=ann.index,
                                    key=ann.key,
                                    tp=form_types_inv.get(ann.type, '?'))
    children.append(widgets.HTML(header_html))

    if not annotate_fields:
        children.append(HtmlView(ann.form))
        return widgets.VBox(children, padding=8)

    names = get_field_names(get_fields_to_annotate(ann.form))
    if len(names) > max_fields:
        notice = "<h4>Too many fields ({})</h4>".format(len(names))
        children.append(widgets.HTML(notice))
        return widgets.VBox(children, padding=8)

    # one page per field: a type selector next to the highlighted form HTML
    pages = []
    for name in names:
        selector = FieldTypeSelect(ann, name)
        view = HtmlView(ann.form, name)
        pages.append(widgets.Box(children=[selector, view]))

    field_tabs = widgets.Tab(children=pages, padding=4)
    for idx, name in enumerate(names):
        field_tabs.set_title(idx, name)
    children.append(field_tabs)

    return widgets.VBox(children, padding=8)
コード例 #14
0
ファイル: storage.py プロジェクト: JonathanBowker/Formasaurus
    def check(self):
        """
        Check that items in storage are correct; print the problems found.
        Return the number of errors found.

        An entry is reported when its file is missing, when the number of
        <form> elements in the file differs from the number of stored form
        annotations, when field annotations are absent or not one per
        form, or when their keys differ from the names of the form's
        fields to annotate.
        """
        entries = list(self.get_index().items())
        progress = tqdm(entries, "Checking", leave=True, mininterval=0,
                        ascii=True, ncols=80, unit=' files')
        errors = 0

        for fn, info in progress:
            fn_full = os.path.join(self.folder, fn)
            if not os.path.exists(fn_full):
                print("\nFile not found: %r" % fn_full)
                errors += 1
                continue

            with open(fn_full, 'rb') as f:
                data = f.read()

            doc = load_html(data, info['url'])
            forms_in_doc = doc.xpath("//form")
            form_count = len(forms_in_doc)
            annotated_count = len(info["forms"])
            if form_count != annotated_count:
                errors += 1
                msg = "\nInvalid form count for entry %r: expected %d, got %d" % (
                    fn, form_count, annotated_count)
                print(msg)

            if 'visible_html_fields' not in info:
                errors += 1
                print("No fields data for entry {!r}".format(fn))
                continue

            fields = info['visible_html_fields']
            if len(fields) != form_count:
                errors += 1
                print("Invalid number of form field annotations for entry {!r}".format(fn))
                continue

            for idx, (form, fields_info) in enumerate(zip(forms_in_doc, fields)):
                # field names present in the document vs. annotated names
                names = {elem.name for elem in get_fields_to_annotate(form)}
                annotated_names = set(fields_info.keys())
                if names == annotated_names:
                    continue
                errors += 1
                print("Invalid field names for form #{}, "
                      "entry {!r}. Expected: {}, found: {}".format(
                          idx, fn, names, annotated_names))

        if errors:
            print("Status: %d error(s) found" % errors)
        else:
            print("Status: OK")

        return errors
コード例 #15
0
ファイル: widgets.py プロジェクト: JonathanBowker/Formasaurus
def FormAnnotator(ann, annotate_fields=True, annotate_types=True, max_fields=80):
    """
    Build the widget used to annotate a single HTML form.

    Children of the returned ``widgets.VBox``: an optional form type
    selector (``annotate_types``), a header with the form type, URL, key
    and index, and then field tabs (``annotate_fields``) or a plain form
    view. Forms with more than ``max_fields`` field names get a
    "Too many fields" notice instead of tabs.
    """
    assert annotate_fields or annotate_types
    form_types_inv = ann.form_schema.types_inv

    children = [FormTypeSelect(ann)] if annotate_types else []

    tpl = """
    <h4>
        {tp} <a href='{url}'>{url}</a>
        <small>{key} #{index}</small>
    </h4>
    """
    header = widgets.HTML(tpl.format(
        url=ann.url,
        index=ann.index,
        key=ann.key,
        tp=form_types_inv.get(ann.type, '?'),
    ))
    children.append(header)

    if annotate_fields:
        names = get_field_names(get_fields_to_annotate(ann.form))
        if len(names) <= max_fields:
            # a tab per field, titled with the field name
            pages = [
                widgets.Box(children=[FieldTypeSelect(ann, name),
                                      HtmlView(ann.form, name)])
                for name in names
            ]
            field_tabs = widgets.Tab(children=pages, padding=4)
            for idx, name in enumerate(names):
                field_tabs.set_title(idx, name)
            children.append(field_tabs)
        else:
            too_many = "<h4>Too many fields ({})</h4>".format(len(names))
            children.append(widgets.HTML(too_many))
    else:
        children.append(HtmlView(ann.form))

    return widgets.VBox(children, padding=8)
コード例 #16
0
ファイル: classifiers.py プロジェクト: charx0r/formasaurus
 def classify(self, form):
     """
     Classify ``form`` and its visible submittable fields.

     Returns a ``{'form': 'type', 'fields': {'name': 'type', ...}}`` dict:
     the form type from the form classifier, and for each field to
     annotate the class predicted by the field model.
     """
     form_type = self.form_classifier.classify(form)
     elems = get_fields_to_annotate(form)
     features = fieldtype_model.get_form_features(form, form_type, elems)
     labels = self._field_model.predict_single(features)

     field_types = {}
     for elem, label in zip(elems, labels):
         field_types[elem.name] = label
     return {'form': form_type, 'fields': field_types}
コード例 #17
0
    def classify(self, form, fields=True):
        """
        Return the type of ``form`` and, optionally, of its visible
        submittable fields.

        With ``fields`` true the result is
        ``{'form': 'type', 'fields': {'name': 'type', ...}}``; with
        ``fields`` false only ``{'form': 'type'}`` is returned and the
        field model is not run.
        """
        form_type = self.form_classifier.classify(form)
        if not fields:
            return {'form': form_type}

        elems = get_fields_to_annotate(form)
        features = fieldtype_model.get_form_features(form, form_type, elems)
        predicted = self._field_model.predict_single(features)
        field_types = dict(
            (elem.name, cls) for elem, cls in zip(elems, predicted))
        return {'form': form_type, 'fields': field_types}
コード例 #18
0
    def classify_proba(self, form, threshold=0.0, fields=True):
        """
        Return dict with probabilities of ``form`` and its fields belonging
        to various form and field classes::

            {
                'form': {'type1': prob1, 'type2': prob2, ...},
                'fields': {
                    'name': {'type1': prob1, 'type2': prob2, ...},
                    ...
                }
            }

        ``form`` should be an lxml HTML <form> element.
        Only classes with probability >= ``threshold`` are preserved.

        If ``fields`` is False, only information about the form is returned::

            {
                'form': {'type1': prob1, 'type2': prob2, ...}
            }

        The most probable form type is used as a feature for the field
        model. If ``threshold`` removes every form class, the form type is
        picked from the unthresholded probabilities instead, while the
        returned ``'form'`` dict stays thresholded (and therefore empty).
        """
        form_types_proba = self.form_classifier.classify_proba(form, threshold)
        res = {'form': form_types_proba}

        if fields:
            # max() raises ValueError on an empty mapping, which happens when
            # no form class reaches ``threshold``; rank unfiltered scores then.
            ranking = form_types_proba
            if not ranking:
                ranking = self.form_classifier.classify_proba(form, 0.0)
            form_type = max(ranking, key=ranking.get)

            field_elems = get_fields_to_annotate(form)
            xseq = fieldtype_model.get_form_features(form, form_type,
                                                     field_elems)
            yseq = self._field_model.predict_marginals_single(xseq)
            res['fields'] = {
                elem.name: thresholded(probs, threshold)
                for elem, probs in zip(field_elems, yseq)
            }

        return res
コード例 #19
0
def test_get_text_around_elems():
    """
    ``get_text_around_elems`` returns a ``(before, after)`` pair of
    ``{elem: text}`` dicts with the text surrounding each given element;
    for an empty element list both dicts are expected to be empty.
    """
    tree = load_html("""
        <form>
            <h1>Login</h1>
            Please <b>enter</b> your details
            <p>
                Username: <input name='username'/> required
                <div>Email:</div> <input type='text' name='email'> *
            </p>
            Thanks!
        </form>
    """)
    elems = get_fields_to_annotate(tree)
    user, email = elems
    before, after = get_text_around_elems(tree, elems)
    assert len(before) == 2
    assert before[user] == 'Login  Please  enter  your details  Username:'
    # text before the second field starts right after the first field
    assert before[email] == 'required  Email:'

    assert len(after) == 2
    assert after[user] == 'required  Email:'
    assert after[email] == '* Thanks!'

    # previously a bare comparison whose result was discarded
    assert get_text_around_elems(tree, []) == ({}, {})
コード例 #20
0
def test_get_text_around_elems():
    """
    Check the text collected before and after each field of a small login
    form, and that an empty element list produces two empty dicts.
    """
    tree = load_html("""
        <form>
            <h1>Login</h1>
            Please <b>enter</b> your details
            <p>
                Username: <input name='username'/> required
                <div>Email:</div> <input type='text' name='email'> *
            </p>
            Thanks!
        </form>
    """)
    elems = get_fields_to_annotate(tree)
    user, email = elems
    before, after = get_text_around_elems(tree, elems)

    assert len(before) == 2
    assert before[user] == 'Login  Please  enter  your details  Username:'
    assert before[email] == 'required  Email:'

    assert len(after) == 2
    # the text between the two fields is "after" the first one as well
    assert after[user] == 'required  Email:'
    assert after[email] == '* Thanks!'

    # the comparison must be asserted, and against a tuple of two dicts
    assert get_text_around_elems(tree, []) == ({}, {})