def get_form_features(form, form_type, field_elems=None):
    """
    Build one feature dict for every field of a <form> element.

    When ``field_elems`` is None the fields are taken from
    ``get_fields_to_annotate(form)``. Each dict starts as the result of
    ``_elem_features(elem)`` and is then extended with the form type,
    1- and 2-grams of the text surrounding the field, a constant bias
    and, for the first/last field only, positional markers.

    The returned list follows the order of ``field_elems``.
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)
    text_before, text_after = get_text_around_elems(form, field_elems)
    res = [_elem_features(elem) for elem in field_elems]
    last_position = len(res) - 1

    for position, (elem, features) in enumerate(zip(field_elems, res)):
        # Positional markers are only set (to True) where they apply;
        # other fields simply do not have these keys.
        if position == 0:
            features['is-first'] = True
        if position == last_position:
            features['is-last'] = True

        features['form-type'] = form_type

        # Only the 6 tokens closest to the field on the left are used.
        preceding = normalize(text_before[elem])
        preceding_tokens = tokenize(preceding)[-6:]
        features['text-before'] = token_ngrams(preceding_tokens, 1, 2)

        # Only the 5 tokens closest to the field on the right are used.
        following = normalize(text_after[elem])
        following_tokens = tokenize(following)[:5]
        features['text-after'] = token_ngrams(following_tokens, 1, 2)

        features['bias'] = 1

    return res
def get_form_features(form, form_type, field_elems=None):
    """
    Return a list with one feature dict per field of a <form> element.

    ``field_elems`` defaults to ``get_fields_to_annotate(form)``.
    Every dict is produced by ``_elem_features`` and then augmented with:

    * ``is-first`` / ``is-last`` - set to True on the first / last field;
    * ``form-type`` - the ``form_type`` argument, unchanged;
    * ``text-before`` / ``text-after`` - 1..2-grams built from the last
      six tokens before the field and the first five tokens after it;
    * ``bias`` - always 1.
    """
    elems = field_elems
    if elems is None:
        elems = get_fields_to_annotate(form)

    before_by_elem, after_by_elem = get_text_around_elems(form, elems)
    feature_dicts = [_elem_features(el) for el in elems]
    final_idx = len(feature_dicts) - 1

    for i, (el, feats) in enumerate(zip(elems, feature_dicts)):
        if i == 0:
            feats['is-first'] = True
        if i == final_idx:
            feats['is-last'] = True
        feats['form-type'] = form_type

        # Text to the left of the field: keep the trailing 6 tokens.
        left_tokens = tokenize(normalize(before_by_elem[el]))[-6:]
        feats['text-before'] = token_ngrams(left_tokens, 1, 2)

        # Text to the right of the field: keep the leading 5 tokens.
        right_tokens = tokenize(normalize(after_by_elem[el]))[:5]
        feats['text-after'] = token_ngrams(right_tokens, 1, 2)

        feats['bias'] = 1

    return feature_dicts
def test_get_text_around_elems():
    """Check the text collected before and after each annotated field."""
    tree = load_html("""
        <form>
            <h1>Login</h1>
            Please <b>enter</b> your details
            <p>
                Username: <input name='username'/> required
                <div>Email:</div> <input type='text' name='email'> *
            </p>
            Thanks!
        </form>
    """)
    elems = get_fields_to_annotate(tree)
    user, email = elems
    before, after = get_text_around_elems(tree, elems)
    assert len(before) == 2
    assert before[user] == 'Login  Please  enter  your details  Username:'
    # Text between the two inputs is the "before" text of the second one.
    assert before[email] == 'required  Email:'

    assert len(after) == 2
    assert after[user] == 'required  Email:'
    assert after[email] == '* Thanks!'

    # With no elements both mappings must be empty. Unpack instead of
    # comparing against a tuple so the check does not depend on the
    # container type used for the returned pair.
    empty_before, empty_after = get_text_around_elems(tree, [])
    assert empty_before == {}
    assert empty_after == {}
def test_get_text_around_elems():
    """Check the text collected before and after each annotated field."""
    tree = load_html("""
        <form>
            <h1>Login</h1>
            Please <b>enter</b> your details
            <p>
                Username: <input name='username'/> required
                <div>Email:</div> <input type='text' name='email'> *
            </p>
            Thanks!
        </form>
    """)
    elems = get_fields_to_annotate(tree)
    user, email = elems
    before, after = get_text_around_elems(tree, elems)
    assert len(before) == 2
    assert before[user] == 'Login  Please  enter  your details  Username:'
    # Text between the two inputs is the "before" text of the second one.
    assert before[email] == 'required  Email:'

    assert len(after) == 2
    assert after[user] == 'required  Email:'
    assert after[email] == '* Thanks!'

    # With no elements both mappings must be empty. Unpack instead of
    # comparing against a tuple so the check does not depend on the
    # container type used for the returned pair.
    empty_before, empty_after = get_text_around_elems(tree, [])
    assert empty_before == {}
    assert empty_after == {}