def get_form_features(form, form_type, field_elems=None):
    """
    Build one feature dict per field element of a <form> element.

    When ``field_elems`` is None the fields are taken from
    ``get_fields_to_annotate(form)``. Every dict starts as the result
    of ``_elem_features`` and is then extended with the form type,
    n-grams of the text surrounding the field, a constant bias and,
    for the first and the last field, a position flag.
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)
    before_by_elem, after_by_elem = get_text_around_elems(form, field_elems)
    features = [_elem_features(field) for field in field_elems]
    last_pos = len(features) - 1

    for pos, (field, feat) in enumerate(zip(field_elems, features)):
        if pos == 0:
            feat['is-first'] = True
        if pos == last_pos:
            feat['is-last'] = True

        feat['form-type'] = form_type

        # text preceding the field: keep only its last 6 tokens
        leading_tokens = tokenize(normalize(before_by_elem[field]))[-6:]
        feat['text-before'] = token_ngrams(leading_tokens, 1, 2)

        # text following the field: keep only its first 5 tokens
        trailing_tokens = tokenize(normalize(after_by_elem[field]))[:5]
        feat['text-after'] = token_ngrams(trailing_tokens, 1, 2)
        feat['bias'] = 1

    return features
def get_form_features(form, form_type, field_elems=None):
    """
    Return a list with one feature dict for each field of ``form``.

    ``field_elems`` defaults to ``get_fields_to_annotate(form)``. The
    per-element features from ``_elem_features`` are augmented with
    'is-first' / 'is-last' flags, the given ``form_type``, n-grams of
    the text before and after the field, and a constant 'bias' of 1.
    """
    if field_elems is None:
        field_elems = get_fields_to_annotate(form)
    text_before, text_after = get_text_around_elems(form, field_elems)
    feature_dicts = [_elem_features(el) for el in field_elems]

    # Position flags go in first so that they precede the keys added
    # in the loop below, exactly as if they were set inside it.
    if feature_dicts:
        feature_dicts[0]['is-first'] = True
        feature_dicts[-1]['is-last'] = True

    for el, feat in zip(field_elems, feature_dicts):
        # last 6 tokens of the text before the element
        tokens_before = tokenize(normalize(text_before[el]))[-6:]
        ngrams_before = token_ngrams(tokens_before, 1, 2)

        # first 5 tokens of the text after the element
        tokens_after = tokenize(normalize(text_after[el]))[:5]
        ngrams_after = token_ngrams(tokens_after, 1, 2)

        feat.update({
            'form-type': form_type,
            'text-before': ngrams_before,
            'text-after': ngrams_after,
            'bias': 1,
        })

    return feature_dicts
def _elem_features(elem):
    """
    Build the feature dict for a single form field element.

    Features come from the element's tag, its name, a handful of
    attributes (value, placeholder, class, id, title), its label when
    there is one, the input type for ``input`` tags and the options
    for ``select`` tags.

    NOTE(review): ``elem`` is presumably an lxml form field element
    (it is accessed through ``.name``, ``.label``, ``.get`` and
    ``.xpath``) -- confirm against callers.
    """
    name_text = normalize(elem.name)
    value_text = _elem_attr(elem, "value")
    placeholder_text = _elem_attr(elem, "placeholder")
    css_class_text = _elem_attr(elem, "class")
    id_text = _elem_attr(elem, "id")
    title_text = _elem_attr(elem, "title")

    tag = elem.tag
    features = {"tag": tag}
    features["name"] = tokenize(name_text)
    features["name-ngrams-3-5"] = ngrams(name_text, 3, 5)
    # "value" and "value-ngrams" intentionally hold separately computed,
    # equal results; both keys are part of the existing feature set.
    features["value"] = ngrams(value_text, 5, 5)
    features["value-ngrams"] = ngrams(value_text, 5, 5)
    features["css-class-ngrams"] = ngrams(css_class_text, 5, 5)
    features["help"] = tokenize(title_text + " " + placeholder_text)
    features["id-ngrams"] = ngrams(id_text, 4, 4)
    features["id"] = tokenize(id_text)

    label_elem = elem.label
    if label_elem is not None:
        label_text = normalize(label_elem.text_content())
        features["label"] = tokenize(label_text)
        features["label-ngrams-3-5"] = ngrams(label_text, 3, 5)

    if tag == "input":
        # an <input> without an explicit type is treated as "text"
        features["input-type"] = elem.get("type", "text").lower()

    if tag == "select":
        option_texts = [normalize(t) for t in elem.xpath("option//text()")]
        option_values = [
            normalize(opt.get("value", "")) for opt in elem.xpath("option")
        ]
        # distinct number patterns over option texts and values
        patterns = {number_pattern(v) for v in option_texts + option_values}
        features["option-text"] = option_texts
        features["option-value"] = option_values
        features["option-num-pattern"] = list(patterns)

    return features
def _elem_features(elem):
    """
    Return a dict of features describing one form field element.

    The dict always has tag, name, value, css class, help (title plus
    placeholder) and id features; label features are added when the
    element has a label, 'input-type' for ``input`` tags and option
    features for ``select`` tags.
    """
    elem_name = normalize(elem.name)
    # normalized attribute values, looked up once per attribute
    attrs = {
        attr_name: _elem_attr(elem, attr_name)
        for attr_name in ('value', 'placeholder', 'class', 'id', 'title')
    }

    feat = {
        'tag': elem.tag,
        'name': tokenize(elem_name),
        'name-ngrams-3-5': ngrams(elem_name, 3, 5),
        'value': ngrams(attrs['value'], 5, 5),
        'value-ngrams': ngrams(attrs['value'], 5, 5),
        'css-class-ngrams': ngrams(attrs['class'], 5, 5),
        'help': tokenize(attrs['title'] + " " + attrs['placeholder']),
        'id-ngrams': ngrams(attrs['id'], 4, 4),
        'id': tokenize(attrs['id']),
    }

    label = elem.label
    if label is not None:
        label_text = normalize(label.text_content())
        feat['label'] = tokenize(label_text)
        feat['label-ngrams-3-5'] = ngrams(label_text, 3, 5)

    if elem.tag == 'input':
        # missing type attribute falls back to 'text'
        input_type = elem.get('type', 'text')
        feat['input-type'] = input_type.lower()

    if elem.tag == 'select':
        option_text = []
        for text_node in elem.xpath('option//text()'):
            option_text.append(normalize(text_node))
        feat['option-text'] = option_text

        option_value = []
        for option in elem.xpath('option'):
            option_value.append(normalize(option.get('value', '')))
        feat['option-value'] = option_value

        # de-duplicated number patterns of all option texts and values
        unique_patterns = {
            number_pattern(item) for item in option_text + option_value
        }
        feat['option-num-pattern'] = list(unique_patterns)

    return feat
def _elem_features(elem):
    """
    Compute the features of a single form field element as a dict.

    Token and character n-gram features are taken from the element's
    name, value, class, id, title and placeholder; label, input type
    and select option features are added when they apply.
    """
    name = normalize(elem.name)
    value = _elem_attr(elem, 'value')
    placeholder = _elem_attr(elem, 'placeholder')
    css_class = _elem_attr(elem, 'class')
    elem_id = _elem_attr(elem, 'id')
    title = _elem_attr(elem, 'title')

    # Ordered (key, value) pairs; dict() keeps this order.
    feat = dict([
        ('tag', elem.tag),
        ('name', tokenize(name)),
        ('name-ngrams-3-5', ngrams(name, 3, 5)),
        ('value', ngrams(value, 5, 5)),
        ('value-ngrams', ngrams(value, 5, 5)),
        ('css-class-ngrams', ngrams(css_class, 5, 5)),
        ('help', tokenize(title + " " + placeholder)),
        ('id-ngrams', ngrams(elem_id, 4, 4)),
        ('id', tokenize(elem_id)),
    ])

    label = elem.label
    if label is not None:
        text = normalize(label.text_content())
        feat.update([
            ('label', tokenize(text)),
            ('label-ngrams-3-5', ngrams(text, 3, 5)),
        ])

    is_input = elem.tag == 'input'
    if is_input:
        # no type attribute means a 'text' input
        feat['input-type'] = elem.get('type', 'text').lower()

    is_select = elem.tag == 'select'
    if is_select:
        texts = [normalize(node) for node in elem.xpath('option//text()')]
        values = [normalize(opt.get('value', '')) for opt in elem.xpath('option')]
        feat['option-text'] = texts
        feat['option-value'] = values
        # unique number patterns across option texts and values
        feat['option-num-pattern'] = list(
            {number_pattern(entry) for entry in texts + values}
        )

    return feat
Exemple #6
0
def _link_inside_text(link: Dict) -> str:
    """
    Return the normalized text of ``link``.

    The text is the link's 'inside_text' value and the 'title' entry
    of its 'attrs' mapping, joined by a single space; missing entries
    count as empty strings.
    """
    inner_text = link.get('inside_text', '')
    attributes = link.get('attrs', {})
    title_attr = attributes.get('title', '')
    combined = inner_text + ' ' + title_attr
    return normalize(combined)
Exemple #7
0
def test_normalize():
    """Check normalize() on a mixed-case string containing a newline and extra spaces."""
    raw = "Hello,\n  world!"
    expected = "hello, world!"
    assert normalize(raw) == expected
def _elem_attr(elem, attr):
    """Return the normalized value of attribute ``attr`` of ``elem``; a missing attribute is read as ''."""
    raw_value = elem.get(attr, '')
    return normalize(raw_value)
def _elem_attr(elem, attr):
    """Look up ``attr`` on ``elem`` (defaulting to '') and return it normalized."""
    attr_text = elem.get(attr, '')
    normalized = normalize(attr_text)
    return normalized