def get_form_features(form, form_type, field_elems=None):
    """
    Return a list of feature dicts, a dict per visible submittable
    field in a <form> element.

    ``form`` is the form to inspect and ``form_type`` is stored verbatim
    under the ``'form-type'`` key of every dict. ``field_elems`` may be
    passed to supply the fields explicitly; when it is ``None`` they are
    obtained from ``get_fields_to_annotate(form)``.

    On top of whatever ``_elem_features`` produces, each dict receives
    ``'form-type'``, ``'text-before'``, ``'text-after'`` and ``'bias'``;
    the first dict also gets ``'is-first'`` and the last one ``'is-last'``.
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)

    before_by_elem, after_by_elem = get_text_around_elems(form, field_elems)
    features = [_elem_features(field) for field in field_elems]
    last_position = len(features) - 1

    for position, feat in enumerate(features):
        # Positional markers are only present on the boundary fields.
        if position == 0:
            feat['is-first'] = True
        if position == last_position:
            feat['is-last'] = True
        feat['form-type'] = form_type

        field = field_elems[position]

        # Text preceding the field: keep only the six tokens closest to it.
        leading_tokens = tokenize(normalize(before_by_elem[field]))[-6:]
        feat['text-before'] = token_ngrams(leading_tokens, 1, 2)

        # Text following the field: keep only the five tokens closest to it.
        trailing_tokens = tokenize(normalize(after_by_elem[field]))[:5]
        feat['text-after'] = token_ngrams(trailing_tokens, 1, 2)

        feat['bias'] = 1

    return features
def get_form_features(form, form_type, field_elems=None):
    """
    Return a list of feature dicts, a dict per visible submittable
    field in a <form> element.

    The fields default to ``get_fields_to_annotate(form)`` when
    ``field_elems`` is ``None``. Every dict starts out as the result of
    ``_elem_features`` for its field and is then extended with the form
    type, n-gram features for the text around the field, a constant
    ``'bias'`` of 1, and ``'is-first'`` / ``'is-last'`` flags on the
    first and last field respectively.
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)

    preceding_text, following_text = get_text_around_elems(form, field_elems)
    feature_dicts = [_elem_features(el) for el in field_elems]
    final_pos = len(feature_dicts) - 1

    for pos, features in enumerate(feature_dicts):
        if pos == 0:
            features['is-first'] = True
        if pos == final_pos:
            features['is-last'] = True
        features['form-type'] = form_type

        current = field_elems[pos]

        # n-grams over the last six tokens that come before the field
        normalized = normalize(preceding_text[current])
        window = tokenize(normalized)[-6:]
        features['text-before'] = token_ngrams(window, 1, 2)

        # n-grams over the first five tokens that come after the field
        normalized = normalize(following_text[current])
        window = tokenize(normalized)[:5]
        features['text-after'] = token_ngrams(window, 1, 2)

        features['bias'] = 1

    return feature_dicts
def test_get_text_around_elems():
    """
    Check the text that ``get_text_around_elems`` reports before and
    after each annotated field of a small login form.
    """
    # The fixture is split into adjacent literals purely for readability;
    # the concatenated markup is unchanged.
    tree = load_html(
        " <form> <h1>Login</h1> Please <b>enter</b> your details <p> "
        "Username: <input name='username'/> required "
        "<div>Email:</div> <input type='text' name='email'> * </p> "
        "Thanks! </form> "
    )
    elems = get_fields_to_annotate(tree)
    user, email = elems
    before, after = get_text_around_elems(tree, elems)

    assert len(before) == 2
    assert before[user] == 'Login Please enter your details Username:'
    # The text between the two inputs is both what follows ``user`` and
    # what precedes ``email``.
    assert before[email] == 'required Email:'

    assert len(after) == 2
    assert after[user] == 'required Email:'
    assert after[email] == '* Thanks!'

    # No fields to annotate: expect a pair of empty mappings. The
    # parentheses matter; without them the comparison only applies to
    # the first element of a tuple expression.
    assert get_text_around_elems(tree, []) == ({}, {})