Beispiel #1
0
def stage2_step_classify_postprocess(elements, ctx):
    """Finish the classification of one group of elements.

    The tag elements of ``elements`` are processed in reverse order.
    Every element that has no ``dbce_class`` yet is marked as
    ``DBCE_CLASS_UNKNWON`` and gets its statistics computed.  When the
    group consists of a single element, or all elements carry the same
    class, the class of the first (reversed) element is pushed up to
    its parent -- unless that class is "unknown".

    :param elements: iterable of nodes (presumably siblings; confirm
        against the caller)
    :param ctx: processing context providing ``nodestats``
    :return: ``None``
    """
    candidates = list(filter_tag(elements))
    candidates.reverse()
    total = len(candidates)
    if total < 1:
        return

    def info_of(node):
        return get_node_info(node, ctx.nodestats, no_insert=True)

    # Give every still unclassified element an explicit "unknown" class.
    for candidate in candidates:
        info = info_of(candidate)
        if info.dbce_class is not None:
            continue
        info.dbce_class = DBCE_CLASS_UNKNWON
        node_stats(candidate, ctx)

    (classes, class_counts) = get_class_stats(candidates, ctx)
    distribution = compute_class_distribution(candidates, ctx)

    id_pairs = [(_.name, id(_)) for _ in candidates]
    logger.debug("# stage #1: ids  ({}): {}".format(total, id_pairs))
    logger.debug("# stage #1: classes: {} / {}".format(classes, class_counts))
    logger.debug("# stage #1: distribution: {}".format(distribution))

    if total != 1 and not is_list_equal(classes):
        logger.warning(
            "# stage #1: ec: {} and classes not equal".format(total))
        return

    # Homogeneous group: the parent inherits the common class.
    first = candidates[0]
    first_info = info_of(first)
    if first_info.dbce_class != DBCE_CLASS_UNKNWON:
        set_nodes_class(first_info.parent, first_info.dbce_class, ctx)
    node_stats(first, ctx)
Beispiel #2
0
def node_stats(node, ctx):
    """Compute the text/tag statistics and density ratios of ``node``.

    The work is done at most once per node: when the node info is
    already flagged ``stats_done`` the function returns immediately.
    Direct string children contribute words and text length, direct tag
    children contribute tag/link counters, and the info of every bs4
    ``Tag`` child is folded in through ``sum_stats``.

    :param node: node whose direct children are inspected
    :param ctx: processing context providing ``nodestats``
    :return: ``None``; the results are stored on the node info object
    """
    info = get_node_info(node, ctx.nodestats, no_insert=True)
    if info.stats_done:
        return

    stats = info.stats
    uncounted_tags = [DBCE_MARKUP_ANALYSIS_TAG, 'br']

    for child in node.children:
        if is_string(child):
            words = child.string.split()
            stats['word_cnt'] += len(words)
            # every word counts with one extra character (the separator)
            stats['text_len'] += sum(len(word.strip()) + 1 for word in words)

            # NOTE(review): with true division round() may already round
            # up, so the extra line added for a remainder can over-count
            # by one.  Kept unchanged because text_density is derived
            # from this value -- confirm before changing.
            wrapped = round(stats['text_len']/80)
            if stats['text_len'] % 80:
                wrapped += 1
            stats['num_wraped_lines'] = wrapped
        elif is_tag(child, any_tag=True):
            tag_name = child.name
            if tag_name not in uncounted_tags:
                stats['tag_cnt'] += 1
            if tag_name == 'a':
                stats['href_cnt'] += 1
                stats['text_in_links'] += len(child.text.strip().split())

        if isinstance(child, bs4.element.Tag):
            info.sum_stats(
                get_node_info(child, ctx.nodestats, no_insert=True))

    word_total = stats['word_cnt']
    tag_total = stats['tag_cnt']
    info.w2t_ratio = ratio(word_total, tag_total)
    info.t2w_ratio = ratio(tag_total, word_total)
    info.link_density = ratio(stats['text_in_links'], word_total)
    info.text_density = ratio(word_total, stats['num_wraped_lines'])
    info.stats_done = True
Beispiel #3
0
def _init_stats(ctx, soup):
    """Register every bs4 ``Tag`` of ``soup`` in ``ctx.nodestats``.

    The layers returned by ``get_layers`` are visited from the highest
    key to the lowest.  For each tag its name is recorded in
    ``ctx.seen_tags`` and ``get_node_info`` is called without
    ``no_insert``.

    :param ctx: processing context (``nodestats``, ``seen_tags``)
    :param soup: parsed document handed to ``get_layers``
    """
    registry = ctx.nodestats
    layers = get_layers(soup)
    for depth in sorted(layers, reverse=True):
        for group in layers[depth]:
            for node in group:
                if not isinstance(node, bs4.element.Tag):
                    continue
                ctx.seen_tags.add(node.name)
                get_node_info(node, registry)
Beispiel #4
0
def get_direct_vector(ctx, prev_node, cur_node, next_node):
    """Build the feature tuple for ``cur_node`` and its two neighbours.

    A falsy ``prev_node`` / ``next_node`` is replaced by the info object
    of the dummy node.  The tuple contains, in this order: the indices
    of the three node names in the sorted ``ctx.seen_tags``, a hash of
    the CSS classes of ``cur_node`` (``-1`` for ``dbce`` nodes and for
    nodes without classes), link/text densities, word counts, text
    lengths and tag counts of the three nodes and finally the
    text-length/tag-count ratio of ``cur_node``.

    :param ctx: processing context (``nodestats``, ``seen_tags``)
    :param prev_node: node info of the previous element or a falsy value
    :param cur_node: node info of the element being described
    :param next_node: node info of the next element or a falsy value
    :return: tuple with 20 feature values
    :raises ValueError: if one of the node names is not in
        ``ctx.seen_tags``
    """
    dummy_node = get_node_info(get_dummy_node(ctx),
                               ctx.nodestats,
                               no_insert=True)

    if not prev_node:
        prev_node = dummy_node
    if not next_node:
        next_node = dummy_node
    _known_tags = sorted(ctx.seen_tags)
    _node_class = -1

    if cur_node.node_name != 'dbce':
        _cls = cur_node.node.attrs.get('class', [])
        if isinstance(_cls, str):
            # bs4 yields a plain string when multi-valued attributes are
            # disabled (e.g. XML parsing)
            _cls = _cls.split()
        if len(_cls):
            # Sort a copy: building a feature vector must not reorder
            # the class attribute of the document node itself.
            # NOTE(review): hash() of a str is salted per interpreter
            # process unless PYTHONHASHSEED is fixed, so this feature is
            # only comparable within one run -- confirm how stored
            # vectors / trained models deal with that.
            _node_class = hash(" ".join(sorted(_cls)))

    vector = (_known_tags.index(prev_node.node_name),
              _known_tags.index(cur_node.node_name),
              _known_tags.index(next_node.node_name),
              _node_class,
              prev_node.link_density, prev_node.text_density,
              cur_node.link_density, cur_node.text_density,
              next_node.link_density, next_node.text_density,
              prev_node.stats['word_cnt'],
              cur_node.stats['word_cnt'],
              next_node.stats['word_cnt'],
              prev_node.stats['text_len'],
              cur_node.stats['text_len'],
              next_node.stats['text_len'],
              prev_node.stats['tag_cnt'],
              cur_node.stats['tag_cnt'],
              next_node.stats['tag_cnt'],
              ratio(cur_node.stats['text_len'], cur_node.stats['tag_cnt']))
    return vector
Beispiel #5
0
def extract_content(soup, ctx):
    """Return the text of all ``dbce`` nodes classified as content.

    Every ``dbce`` tag below ``soup.body`` whose node info satisfies
    ``is_content`` contributes its stripped text, wrapped to at most 70
    columns.

    :param soup: parsed document
    :param ctx: processing context providing ``nodestats``
    :return: the wrapped lines joined with newlines
    """
    lines = []
    for node in filter_tag(soup.body.recursiveChildGenerator(), ['dbce']):
        info = get_node_info(node, ctx.nodestats, no_insert=True)
        if not is_content(info):
            continue
        lines.extend(textwrap.wrap(node.text.strip(), 70))

    return "\n".join(lines)
Beispiel #6
0
def extend_nodes_class(node, ctx):
    """Add the node's dbce class and the marker class to its ``class``.

    A missing ``class`` attribute is created as an empty list first;
    then ``dbce_class`` of the node info and the literal
    ``'dbce-marker'`` are inserted via ``_insert_class_value``.

    :param node: tag whose ``attrs['class']`` is extended
    :param ctx: processing context providing ``nodestats``
    :raises AssertionError: if ``None`` ended up in the class list
    """
    if 'class' not in node.attrs:
        node.attrs['class'] = []
    node_info = get_node_info(node, ctx.nodestats, no_insert=True)

    _insert_class_value(node, node_info.dbce_class)
    _insert_class_value(node, 'dbce-marker')
    # ``attrs`` is a dict, so the key has to be looked up with .get();
    # getattr() on it never finds 'class' and made this check vacuous.
    assert None not in node.attrs.get('class', []), \
        "None must not be inserted as a class value"
Beispiel #7
0
def classify_elements_stage3(elements, ctx, clf=None, testname=None):
    """Predict a dbce class for every element using the classifier.

    One feature vector per element is built with ``get_vector`` (all
    vectors use the parent of ``elements[0]``) and the whole batch is
    passed to ``clf.predict``.  A prediction of ``1`` maps to
    ``DBCE_CLASS_CONTENT``, everything else to
    ``DBCE_CLASS_BOILER_PLATE``.

    :param elements: sequence of nodes; the code assumes they share the
        same parent
    :param ctx: processing context providing ``nodestats``
    :param clf: object with a ``predict(vectors)`` method; despite the
        ``None`` default it is required as soon as ``elements`` is not
        empty
    :param testname: forwarded to ``get_vector``
    :return: ``None`` for empty input, otherwise the result of
        ``map()`` over the predictions (a one-shot iterator on
        Python 3)
    """
    # all nodes share the same parent
    if len(elements) < 1:
        logger.warning("not enough elements...")
        return None

    parent = get_node_info(elements[0].parent, ctx.nodestats, no_insert=True)

    # Even preparing the log message is too expensive, so only do it
    # when DEBUG is really enabled.  isEnabledFor() honours levels
    # inherited from parent loggers; comparing ``logger.level`` directly
    # missed those.
    debug_enabled = logger.isEnabledFor(logging.DEBUG)

    _vectors = []
    for (prev_node, cur_node,
         next_node) in iterate_over_elements(elements, ctx):
        _vec = get_vector(ctx,
                          prev_node,
                          cur_node,
                          next_node,
                          parent,
                          testname=testname)
        if debug_enabled:
            logger.debug(" vector : {}".format(_vec))
        _vectors.append(_vec)

    _oracle = clf.predict(_vectors)
    if debug_enabled:
        logger.debug(" oracle: {}".format(_oracle))

    def r2class(result):
        if result == 1:
            return DBCE_CLASS_CONTENT
        return DBCE_CLASS_BOILER_PLATE

    return map(r2class, _oracle)
Beispiel #8
0
def get_parent_environ(ctx, parent):
    """Return the node infos of the nearest tag siblings of ``parent``.

    ``<a>`` parents are skipped: the lookup continues with the node info
    of their own parent.

    :param ctx: processing context providing ``nodestats``
    :param parent: node info object or a falsy value
    :return: ``None`` when ``parent`` is falsy, otherwise a tuple
        ``(prev_sibling_info, next_sibling_info)`` whose members are
        ``None`` where no tag sibling exists
    """
    if not parent:
        return None

    def info_of(node):
        return get_node_info(node, ctx.nodestats, no_insert=True)

    if parent.node_name in ['a']:
        return get_parent_environ(ctx, info_of(parent.parent))

    # first tag sibling in front of the parent ...
    before = next(iter(filter_tag(parent.node.previousSiblingGenerator())),
                  None)
    prev_sib = info_of(before) if before is not None else None

    # ... and first tag sibling behind it
    after = next(iter(filter_tag(parent.node.nextSiblingGenerator())), None)
    next_sib = info_of(after) if after is not None else None

    return (prev_sib, next_sib)
Beispiel #9
0
def iterate_over_elements(elements, ctx):
    """Yield ``(previous, current, next)`` node infos for each element.

    The dummy node pads both ends: the first triple has the dummy as
    ``previous`` and the last one has it as ``next``.  For an empty
    ``elements`` a single all-dummy triple is yielded.

    :param elements: iterable of nodes
    :param ctx: processing context providing ``nodestats``
    :return: generator of 3-tuples of node info objects
    """
    sentinel = get_dummy_node(ctx)

    def info_of(node):
        return get_node_info(node, ctx.nodestats, no_insert=True)

    nodes_iter = iter(elements)
    prev_node = sentinel
    cur_node = next(nodes_iter, sentinel)
    next_node = next(nodes_iter, sentinel)

    yield (info_of(prev_node), info_of(cur_node), info_of(next_node))

    # The sentinel is recognised by identity.  ``!=`` on bs4 tags is a
    # structural comparison, which is slower and would also match a real
    # element that merely looks like the dummy node.
    while next_node is not sentinel:
        prev_node, cur_node = cur_node, next_node
        next_node = next(nodes_iter, sentinel)
        yield (info_of(prev_node), info_of(cur_node), info_of(next_node))
Beispiel #10
0
def extend_nodes_info(ctx, element):
    """Copy class, counters and ratios of ``element`` to ``add_html_info``.

    The values are stored under ``dbce-*`` keys in the ``add_html_info``
    mapping of the element's node info.  ``dbce-pp-class`` is only
    written when a preprocess class is set.

    :param ctx: processing context providing ``nodestats``
    :param element: node whose info object is extended
    """
    node_info = get_node_info(element, ctx.nodestats, no_insert=True)
    nd_stat = node_info.stats
    html_info = node_info.add_html_info

    html_info['dbce-class'] = node_info.dbce_class
    html_info['dbce-wc'] = nd_stat['word_cnt']
    html_info['dbce-tc'] = nd_stat['tag_cnt']
    html_info['dbce-hrefs'] = nd_stat['href_cnt']
    html_info['dbce-w2t'] = node_info.w2t_ratio
    html_info['dbce-t2w'] = node_info.t2w_ratio
    html_info['dbce-lden'] = node_info.link_density
    html_info['dbce-tden'] = node_info.text_density
    html_info['dbce-id'] = node_info.nodeid
    if node_info.preprocess_class:
        # The preprocess class lives on the node info.  Attribute access
        # on the bs4 element itself is a child-tag lookup and would not
        # return the classification.
        html_info['dbce-pp-class'] = node_info.preprocess_class
Beispiel #11
0
def set_nodes_class(node, cls, ctx, force=False):
    """Assign ``cls`` as the dbce class of ``node`` and compute its stats.

    A node that already carries a real class (neither ``None`` nor
    ``DBCE_CLASS_UNKNWON``) may only be switched to a different class
    when ``force`` is true.

    :param node: node to classify
    :param cls: class to assign
    :param ctx: processing context providing ``nodestats``
    :param force: allow replacing an existing, different class
    :raises ValueError: on an attempted class change without ``force``
    """
    node_info = get_node_info(node, ctx.nodestats, no_insert=True)
    current = node_info.dbce_class
    has_real_class = current is not None and current != DBCE_CLASS_UNKNWON

    if not has_real_class:
        logger.debug("# you want me to set <{}> class? ({}) {} to {}".format(
            node_info.node_name, node_info.nodeid, current, cls))
    else:
        logger.debug(
            "# you want me to change <{}> class? ({}) {} to {}".format(
                node_info.node_name, node_info.nodeid, current, cls))
        if current != cls and not force:
            raise ValueError("class change {} -> {} not allowed".format(
                current, cls))

    node_info.dbce_class = cls
    node_stats(node, ctx)
Beispiel #12
0
def collect_vectors(ctx, soup):
    """Collect labelled feature vectors from a classified document.

    The layers of ``soup`` are visited from the highest key to the
    lowest.  An element is used when it or one of its two neighbours is
    classified as content or boiler plate.  Its vector is stored with
    label ``1`` if ``is_content`` holds for it and ``0`` otherwise.  If
    the same vector shows up with the opposite label, the conflict is
    logged and the stored entry is dropped.

    :param ctx: processing context providing ``nodestats``
    :param soup: parsed and classified document
    :return: ``OrderedDict`` mapping vector -> ``(xpath, label)``
    """
    vectors = OrderedDict()
    labelled_classes = (DBCE_CLASS_CONTENT, DBCE_CLASS_BOILER_PLATE)

    layers = get_layers(soup)
    for level in sorted(layers.keys(), reverse=True):
        for nodes in layers[level]:
            _elements = list(filter_tag(nodes))
            if len(_elements) == 0:
                continue

            # all elements have the same parent so let's get it only once
            # bs4 is awfully slow (__getattr__.find() ...)
            parent = get_node_info(_elements[0].parent,
                                   ctx.nodestats,
                                   no_insert=True)
            get_xpath(parent.node, ctx)
            for (prev_node, cur_node,
                 next_node) in iterate_over_elements(_elements, ctx):

                get_xpath(cur_node.node, ctx)
                if not any(_.dbce_class in labelled_classes
                           for _ in (prev_node, cur_node, next_node)):
                    continue

                cur_node.clf_vector = get_vector(ctx, prev_node, cur_node,
                                                 next_node, parent)
                label = 1 if is_content(cur_node) else 0
                known = vectors.get(cur_node.clf_vector)

                # Entries are (xpath, label) tuples, so the conflict check
                # has to look at the stored label.  Comparing the tuple
                # itself with 0/1 flagged every repeated vector as a
                # conflict, even when the labels agreed.
                if known is not None and known[1] != label:
                    if label == 1:
                        logger.error("vector conflict bp: {}".format(
                            cur_node.clf_vector))
                    else:
                        logger.error("vector conflict content: {}".format(
                            cur_node.clf_vector))
                    # NOTE(review): the result is unused; kept as in the
                    # original -- looks like a debugging leftover.
                    get_parent_environ(ctx, parent)
                    # NOTE(review): a later occurrence of this vector is
                    # stored again, the conflict is not remembered.
                    del vectors[cur_node.clf_vector]
                    continue

                vectors[cur_node.clf_vector] = (cur_node.xpath, label)

    return vectors
Beispiel #13
0
def wrap_with_div(soup, ctx, versions=None, clean_markup=False, use_dbce=True):
    """Return ``soup`` with every analysis tag wrapped in a presentation tag.

    Each ``DBCE_MARKUP_ANALYSIS_TAG`` element is wrapped in a new
    ``DBCE_MARKUP_PRESENTATION_TAG`` element carrying the content class,
    ``dbce-marker`` and a ``dbce-bits-...`` class.  With ``versions`` a
    ``<dbce-info>`` tag describing them is inserted at the start of the
    body.  If ``clean_markup`` is true the analysis tags themselves are
    unwrapped afterwards.

    :param soup: document, modified in place
    :param ctx: processing context providing ``nodestats``
    :param versions: optional iterable used for the bit classification
    :param clean_markup: remove the analysis tags after wrapping
    :param use_dbce: derive the class from the bit pattern instead of
        the stored ``dbce_class``
    :return: the modified ``soup``
    :raises ValueError: if ``use_dbce`` is false and a node has no
        ``dbce_class``
    """
    if versions:
        info_tag = soup.new_tag('dbce-info')
        info_tag['class'] = []
        version_bits = ''.join(str(_) for _ in versions)
        _insert_class_value(info_tag, "dbce-bits-{}".format(version_bits))
        soup.body.insert(0, info_tag)

    for marked in soup.find_all(DBCE_MARKUP_ANALYSIS_TAG):
        marked_info = get_node_info(marked, ctx.nodestats, no_insert=True)

        wrapper = soup.new_tag(DBCE_MARKUP_PRESENTATION_TAG)
        wrapper['class'] = []
        node_bits = marker_to_bits(marked['class'])

        if not use_dbce:
            # take the class computed earlier; it has to exist by now
            if marked_info.dbce_class is None:
                raise ValueError(
                    "{}.dbce_class should be defined at this stage".format(
                        marked_info.nodeid))
            content_class = marked_info.dbce_class
            _insert_class_value(wrapper, 'dbce-no-bits')
        else:
            if versions:
                content_class = classify_block(versions, node_bits)
            else:
                content_class = DBCE_CLASS_UNKNWON
            if content_class == DBCE_CLASS_UNKNWON:
                _copy_class_values(marked, wrapper)

        _insert_class_value(wrapper, content_class)
        _insert_class_value(wrapper, 'dbce-marker')

        bits_label = ''.join(map(str, node_bits))
        _insert_class_value(wrapper, "dbce-bits-{}".format(bits_label))
        marked.wrap(wrapper)

        if clean_markup:
            marked.unwrap()

    return soup
Beispiel #14
0
def stage2_step_classify(elements, ctx, testname, versions=None):
    """Classify the analysis tags of one group by their bit patterns.

    For every ``DBCE_MARKUP_ANALYSIS_TAG`` element the class is derived
    with ``classify_block`` from the bits encoded in its ``class``
    attribute.  For test ``'T002'`` an invalid result is treated as
    boiler plate.  The class is stored as the node's class and as its
    ``preprocess_class``; afterwards ``preprocess_fix_child`` is run on
    the element.

    :param elements: iterable of nodes (one group, same parent)
    :param ctx: processing context providing ``nodestats``
    :param testname: name of the active test configuration
    :param versions: forwarded to ``classify_block``
    :raises ValueError: if an element already has a ``preprocess_class``
    """
    treat_invalid_as_bp = testname in ('T002',)

    for element in list(filter_tag(elements)):
        if not is_tag(element, [DBCE_MARKUP_ANALYSIS_TAG]):
            continue

        bits = marker_to_bits(element['class'])
        block_class = classify_block(versions, bits)
        if treat_invalid_as_bp and block_class == DBCE_CLASS_INVALID:
            block_class = DBCE_CLASS_BOILER_PLATE

        info = get_node_info(element, ctx.nodestats)
        get_xpath(element, ctx)
        set_nodes_class(element, block_class, ctx)

        # a node must be preprocessed only once
        if info.preprocess_class is not None:
            raise ValueError
        info.preprocess_class = block_class

        preprocess_fix_child(element, ctx)
Beispiel #15
0
def get_vector(ctx,
               prev_node,
               cur_node,
               next_node,
               parent=None,
               testname=None):
    """Build the classifier vector for ``cur_node``.

    The vector always starts with the direct vector of the node and its
    neighbours.  Depending on ``ctx.testname`` it is extended with

    * the vector of ``parent`` and the parent's siblings
      (``None``, ``'T004'``, ``'T006'``) and
    * the vector of ``parent`` combined with the siblings of the
      parent's parent (``None``, ``'T005'``, ``'T006'``).

    :param ctx: processing context (``nodestats``, ``testname``)
    :param prev_node: node info of the previous element
    :param cur_node: node info of the current element
    :param next_node: node info of the next element
    :param parent: node info of the common parent; required for every
        configuration that adds parent vectors
    :param testname: not used here, the configuration is read from
        ``ctx.testname``; kept for call compatibility
    :return: tuple of feature values
    """
    # get_parent_environ() returns None instead of a pair when there is
    # no parent to look at; fall back to "no siblings" so the unpacking
    # does not fail.  get_direct_vector() substitutes the dummy node for
    # missing neighbours.
    (p_prev, p_next) = get_parent_environ(ctx, parent) or (None, None)
    level0_vect = get_direct_vector(ctx, prev_node, cur_node, next_node)

    _ret_val = level0_vect

    if ctx.testname in [None, 'T004', 'T006']:
        level1_vect = get_direct_vector(ctx, p_prev, parent, p_next)
        _ret_val += level1_vect
        if ctx.testname not in [None, 'T005', 'T006']:
            return _ret_val

    if ctx.testname in [None, 'T005', 'T006']:
        grandparent = get_node_info(parent.parent,
                                    ctx.nodestats,
                                    no_insert=True)
        (p_prev, p_next) = (get_parent_environ(ctx, grandparent)
                            or (None, None))
        level2_vect = get_direct_vector(ctx, p_prev, parent, p_next)
        _ret_val += level2_vect
    return _ret_val
Beispiel #16
0
def stage2_step_reclassify(elements, ctx, clf=None, testname=None):
    """Re-classify one group of elements with the trained classifier.

    The tag elements are processed in reverse order.  Each prediction
    that differs from the current class is applied with ``force=True``,
    except that a boiler-plate prediction for a node whose parent is
    ``h1``..``h4`` is only reported.  If anything was flagged as changed
    and all classes are equal afterwards, the common class is also
    forced onto the parent.

    :param elements: iterable of nodes; the code assumes they share the
        same parent
    :param ctx: processing context providing ``nodestats``
    :param clf: classifier forwarded to ``classify_elements_stage3``
    :param testname: forwarded to ``classify_elements_stage3``
    :return: ``None``
    """
    candidates = list(filter_tag(elements))
    candidates.reverse()
    if len(candidates) < 1:
        return

    def info_of(node):
        return get_node_info(node, ctx.nodestats, no_insert=True)

    (classes, class_counts) = get_class_stats(candidates, ctx)
    all_equal = is_list_equal(classes)
    first_info = info_of(candidates[0])
    parent_info = info_of(first_info.parent)

    # FIXME: check if an early return is needed here when all classes
    # already equal the parent's (known) class -- the disabled check was
    # copied from stage2

    # we have the same parent for all elements
    parent = parent_info.node
    logger.debug("# stage #3: classes: {} / {} / {}".format(
        classes, class_counts, all_equal))

    predictions = classify_elements_stage3(candidates,
                                           ctx,
                                           clf,
                                           testname=testname)

    heading_tags = ['h1', 'h2', 'h3', 'h4']
    changes = False
    for (node, cls) in zip(candidates, predictions):
        node_info = info_of(node)
        if cls is None:
            continue
        if cls != node_info.dbce_class:
            # FIXME: define a testcase for this one or remove it
            # jusTex method
            logger.debug("# stage #3: reclassify id: {}".format(
                node_info.nodeid))
            is_heading_child = node_info.parent.name in heading_tags
            if cls == DBCE_CLASS_BOILER_PLATE and is_heading_child:
                # reported only, the class is left untouched
                logger.error("take a look on this one - node_name is hX")
            else:
                # FIXME: make the change based on score?!
                set_nodes_class(node, cls, ctx, force=True)
            changes = True

    if not changes:
        return

    (classes, class_counts) = get_class_stats(candidates, ctx)
    all_equal = is_list_equal(classes)
    logger.debug("# stage #3: RE classes: {} / {} / {}".format(
        classes, class_counts, all_equal))
    if not all_equal:
        # FIXME: this should be extended...
        logger.debug("# stage #3: just skip it")
        return

    logger.debug("# stage #3: RE all classes are equal... set up parent")
    common_class = info_of(candidates[0]).dbce_class
    set_nodes_class(parent, common_class, ctx, force=True)
Beispiel #17
0
def get_dummy_node(ctx):
    """Return ``DUMMY_NODE`` after looking it up in ``ctx.nodestats``.

    ``get_node_info`` is called without ``no_insert`` (presumably so the
    dummy node gets registered -- confirm in ``get_node_info``).
    """
    placeholder = DUMMY_NODE
    get_node_info(placeholder, ctx.nodestats)
    return placeholder
Beispiel #18
0
def preprocess_fix_child(element, ctx):
    """Hand the class of ``element`` down to children without node info.

    :param element: node whose tag children are inspected
    :param ctx: processing context providing ``nodestats``
    """
    def info_of(node):
        return get_node_info(node, ctx.nodestats, no_insert=True)

    inherited_class = info_of(element).dbce_class
    tag_children = list(filter_tag(element.children))
    for child in tag_children:
        # NOTE(review): this tests the info object itself, not its
        # ``dbce_class``.  set_nodes_class() performs the same lookup and
        # dereferences the result, so this branch looks either dead or
        # failing -- probably ``.dbce_class is None`` was meant; confirm.
        if info_of(child) is None:
            set_nodes_class(child, inherited_class, ctx)
Beispiel #19
0
def get_class_stats(elements, ctx):
    """Return the dbce classes of ``elements`` and their frequencies.

    :param elements: iterable of nodes known to ``ctx.nodestats``
    :param ctx: processing context providing ``nodestats``
    :return: tuple ``(classes, counts)`` -- the list of classes in
        element order and a ``Counter`` built from it
    """
    classes = []
    for element in elements:
        info = get_node_info(element, ctx.nodestats, no_insert=True)
        classes.append(info.dbce_class)
    return (classes, Counter(classes))