def get_form_features(form, form_type, field_elems=None):
    """
    Build one feature dict per visible submittable field of a <form>.

    ``form`` is handed to ``get_fields_to_annotate`` (only when
    ``field_elems`` is None) and to ``get_text_around_elems``.
    ``form_type`` is copied verbatim into every dict under 'form-type'.

    On top of what ``_elem_features`` produces, each dict receives:
    'is-first' / 'is-last' markers (only on the first / last field),
    'text-before' and 'text-after' token n-grams, and a constant 'bias'.
    The returned list is in the same order as ``field_elems``.
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)

    text_before, text_after = get_text_around_elems(form, field_elems)
    res = [_elem_features(field) for field in field_elems]
    last_idx = len(res) - 1

    for idx, elem_feat in enumerate(res):
        # Positional markers are only present on the boundary fields;
        # a single-field form gets both.
        if idx == 0:
            elem_feat['is-first'] = True
        if idx == last_idx:
            elem_feat['is-last'] = True

        elem_feat['form-type'] = form_type

        # Context preceding the field: keep the 6 tokens nearest to it.
        leading = tokenize(normalize(text_before[field_elems[idx]]))[-6:]
        elem_feat['text-before'] = token_ngrams(leading, 1, 2)

        # Context following the field: keep the 5 tokens nearest to it.
        trailing = tokenize(normalize(text_after[field_elems[idx]]))[:5]
        elem_feat['text-after'] = token_ngrams(trailing, 1, 2)

        elem_feat['bias'] = 1

    return res
def get_form_features(form, form_type, field_elems=None):
    """
    Return a list with one feature dict for each visible submittable
    field in a <form> element.

    When ``field_elems`` is not supplied, the fields are obtained from
    ``get_fields_to_annotate(form)``. Every dict starts as the output of
    ``_elem_features`` and is then extended with positional flags, the
    given ``form_type``, n-grams of the surrounding text and a bias term.
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)
    text_before, text_after = get_text_around_elems(form, field_elems)

    res = [_elem_features(elem) for elem in field_elems]
    total = len(res)

    for position, features in enumerate(res):
        if position == 0:
            features['is-first'] = True
        if position == total - 1:
            features['is-last'] = True

        # The dict literal is evaluated top to bottom, so keys are added
        # in the order: form-type, text-before, text-after, bias.
        features.update({
            'form-type': form_type,
            # last 6 tokens of the text that precedes the field
            'text-before': token_ngrams(
                tokenize(normalize(text_before[field_elems[position]]))[-6:],
                1, 2,
            ),
            # first 5 tokens of the text that follows the field
            'text-after': token_ngrams(
                tokenize(normalize(text_after[field_elems[position]]))[:5],
                1, 2,
            ),
            'bias': 1,
        })

    return res
def _elem_features(elem):
    """Build the feature dict describing a single form field element.

    The dict always contains tag, name, value, CSS class, help
    (title + placeholder) and id features. Label features are added when
    ``elem.label`` is not None, "input-type" for <input> tags, and the
    option features for <select> tags.

    NOTE(review): ``elem`` is used through ``.name``, ``.label``, ``.tag``,
    ``.get`` and ``.xpath`` -- presumably an lxml.html form field; confirm
    against the caller.
    """
    name = normalize(elem.name)
    # Attributes are read in a fixed order; each one is normalized by
    # _elem_attr.
    value, placeholder, css_class, ident, title = (
        _elem_attr(elem, attr)
        for attr in ("value", "placeholder", "class", "id", "title")
    )
    tag = elem.tag

    feat = {
        "tag": tag,
        "name": tokenize(name),
        "name-ngrams-3-5": ngrams(name, 3, 5),
        # "value" and "value-ngrams" carry the same computation; both keys
        # are kept, each with its own result object.
        "value": ngrams(value, 5, 5),
        "value-ngrams": ngrams(value, 5, 5),
        "css-class-ngrams": ngrams(css_class, 5, 5),
        "help": tokenize(title + " " + placeholder),
        "id-ngrams": ngrams(ident, 4, 4),
        "id": tokenize(ident),
    }

    label = elem.label
    if label is not None:
        label_text = normalize(label.text_content())
        feat["label"] = tokenize(label_text)
        feat["label-ngrams-3-5"] = ngrams(label_text, 3, 5)

    if tag == "input":
        # A missing type attribute is treated as "text".
        feat["input-type"] = elem.get("type", "text").lower()

    if tag == "select":
        option_text = [normalize(txt) for txt in elem.xpath("option//text()")]
        option_value = [
            normalize(opt.get("value", "")) for opt in elem.xpath("option")
        ]
        feat["option-text"] = option_text
        feat["option-value"] = option_value
        # De-duplicated number patterns over option texts, then values.
        feat["option-num-pattern"] = list(
            {number_pattern(item) for item in option_text + option_value}
        )

    return feat
def _elem_features(elem):
    """
    Return a dict of features extracted from one form field element.

    Base features come from the element's tag, name, value, class, id,
    title and placeholder. Extra keys are added for an attached label,
    for <input> elements (their type) and for <select> elements
    (their options).
    """
    name_text = normalize(elem.name)
    value_text = _elem_attr(elem, 'value')
    placeholder_text = _elem_attr(elem, 'placeholder')
    class_text = _elem_attr(elem, 'class')
    id_text = _elem_attr(elem, 'id')
    title_text = _elem_attr(elem, 'title')

    # Assembled key by key, in the order the keys must appear in the dict.
    feat = {}
    feat['tag'] = elem.tag
    feat['name'] = tokenize(name_text)
    feat['name-ngrams-3-5'] = ngrams(name_text, 3, 5)
    # Two keys, same arguments: computed separately so that each key
    # holds its own result object.
    feat['value'] = ngrams(value_text, 5, 5)
    feat['value-ngrams'] = ngrams(value_text, 5, 5)
    feat['css-class-ngrams'] = ngrams(class_text, 5, 5)
    feat['help'] = tokenize(title_text + " " + placeholder_text)
    feat['id-ngrams'] = ngrams(id_text, 4, 4)
    feat['id'] = tokenize(id_text)

    label = elem.label
    if label is not None:
        label_text = normalize(label.text_content())
        feat['label'] = tokenize(label_text)
        feat['label-ngrams-3-5'] = ngrams(label_text, 3, 5)

    if elem.tag == 'input':
        # 'text' is used when the element has no type attribute.
        input_type = elem.get('type', 'text')
        feat['input-type'] = input_type.lower()

    if elem.tag == 'select':
        feat['option-text'] = [
            normalize(text_node) for text_node in elem.xpath('option//text()')
        ]
        feat['option-value'] = [
            normalize(option.get('value', ''))
            for option in elem.xpath('option')
        ]
        # Unique number patterns seen in the option texts and values.
        candidates = feat['option-text'] + feat['option-value']
        feat['option-num-pattern'] = list(
            {number_pattern(candidate) for candidate in candidates}
        )

    return feat
def _elem_features(elem):
    """
    Compute the per-field feature dict for ``elem``.

    Always present: 'tag', 'name', 'name-ngrams-3-5', 'value',
    'value-ngrams', 'css-class-ngrams', 'help', 'id-ngrams', 'id'.
    Conditionally present: 'label' and 'label-ngrams-3-5' (element has a
    label), 'input-type' (tag is 'input'), and 'option-text',
    'option-value', 'option-num-pattern' (tag is 'select').
    """
    name = normalize(elem.name)
    value = _elem_attr(elem, 'value')
    placeholder = _elem_attr(elem, 'placeholder')
    css_class = _elem_attr(elem, 'class')
    dom_id = _elem_attr(elem, 'id')
    title = _elem_attr(elem, 'title')

    feat = {
        'tag': elem.tag,
        'name': tokenize(name),
        'name-ngrams-3-5': ngrams(name, 3, 5),
        'value': ngrams(value, 5, 5),
        'value-ngrams': ngrams(value, 5, 5),
        'css-class-ngrams': ngrams(css_class, 5, 5),
        # help text = title and placeholder joined by a single space
        'help': tokenize(title + " " + placeholder),
        'id-ngrams': ngrams(dom_id, 4, 4),
        'id': tokenize(dom_id),
    }

    label = elem.label
    if label is not None:
        label_text = normalize(label.text_content())
        feat.update({
            'label': tokenize(label_text),
            'label-ngrams-3-5': ngrams(label_text, 3, 5),
        })

    if elem.tag == 'input':
        # Falls back to 'text' when no type attribute is set.
        feat['input-type'] = elem.get('type', 'text').lower()

    if elem.tag == 'select':
        option_text = [normalize(t) for t in elem.xpath('option//text()')]
        option_value = [
            normalize(opt.get('value', '')) for opt in elem.xpath('option')
        ]
        feat.update({
            'option-text': option_text,
            'option-value': option_value,
            # distinct number patterns across option texts, then values
            'option-num-pattern': list(
                set(map(number_pattern, option_text + option_value))
            ),
        })

    return feat
def _link_inside_text(link: Dict) -> str:
    """Return the normalized inner text and title of a link dict.

    Reads ``link['inside_text']`` and ``link['attrs']['title']``, each
    defaulting to an empty string when the key is missing, joins them
    with a single space and passes the result through ``normalize``.
    """
    inside = link.get('inside_text', '')
    attrs = link.get('attrs', {})
    combined = inside + ' ' + attrs.get('title', '')
    return normalize(combined)
def test_normalize():
    """Check normalize on a mixed-case string containing a line break.

    The expected value is lower-cased and has the newline-plus-space run
    replaced by a single space.
    """
    raw = "Hello,\n world!"
    expected = "hello, world!"
    assert normalize(raw) == expected
def _elem_attr(elem, attr):
    """Return ``normalize`` applied to attribute ``attr`` of ``elem``.

    The attribute is fetched with ``elem.get(attr, '')``, i.e. an empty
    string is passed as the fallback value.
    """
    raw_value = elem.get(attr, '')
    return normalize(raw_value)