def stage2_step_classify_postprocess(elements, ctx):
    """Post-process one group of sibling elements after classification.

    The elements are passed through ``filter_tag`` and handled in reverse
    order.  Every element without a class is marked as
    ``DBCE_CLASS_UNKNWON`` and gets its statistics computed.  When the
    group consists of a single element, or all classes in the group are
    equal, a known class is propagated to the parent of the first element.

    :param elements: iterable of sibling nodes
    :param ctx: processing context providing ``nodestats``
    :returns: ``None``
    """
    siblings = list(filter_tag(elements))
    siblings.reverse()
    sibling_count = len(siblings)
    if sibling_count < 1:
        return

    # make sure every node carries a class and statistics
    for sibling in siblings:
        info = get_node_info(sibling, ctx.nodestats, no_insert=True)
        if info.dbce_class is None:
            info.dbce_class = DBCE_CLASS_UNKNWON
        node_stats(sibling, ctx)

    classes, class_counts = get_class_stats(siblings, ctx)
    distribution = compute_class_distribution(siblings, ctx)
    node_ids = [(_.name, id(_)) for _ in siblings]
    logger.debug("# stage #1: ids ({}): {}".format(sibling_count, node_ids))
    logger.debug("# stage #1: classes: {} / {}".format(classes, class_counts))
    logger.debug("# stage #1: distribution: {}".format(distribution))

    # a mixed group is only reported, nothing is propagated
    if sibling_count != 1 and not is_list_equal(classes):
        logger.warning(
            "# stage #1: ec: {} and classes not equal".format(sibling_count))
        return

    first_info = get_node_info(siblings[0], ctx.nodestats, no_insert=True)
    if first_info.dbce_class != DBCE_CLASS_UNKNWON:
        set_nodes_class(first_info.parent, first_info.dbce_class, ctx)
        node_stats(siblings[0], ctx)
def node_stats(node, ctx):
    """Compute and cache the text/markup statistics of ``node``.

    The direct children of ``node`` are scanned once.  String children
    add to the word count and text length; tag children add to the tag and
    link counters, and the statistics stored for each child tag are folded
    in through ``sum_stats``.  Afterwards the derived ratios are stored on
    the node info and ``stats_done`` is set, so further calls for the same
    node return immediately.

    :param node: node whose children are analysed
    :param ctx: processing context providing ``nodestats``
    :returns: ``None``
    """
    node_info = get_node_info(node, ctx.nodestats, no_insert=True)
    if node_info.stats_done:
        return

    el_stats = node_info.stats
    for child in node.children:
        if is_string(child):
            for token in child.string.split():
                el_stats['word_cnt'] += 1
                el_stats['text_len'] += len(token.strip()) + 1
            # Number of 80-column lines needed for the text, i.e.
            # ceil(text_len / 80).  Integer division is required here:
            # round(text_len / 80) plus the remainder correction counts
            # one line too many whenever the true quotient rounds up.
            full_lines, remainder = divmod(el_stats['text_len'], 80)
            el_stats['num_wraped_lines'] = full_lines + (1 if remainder else 0)
        elif is_tag(child, any_tag=True):
            if child.name not in [DBCE_MARKUP_ANALYSIS_TAG, 'br']:
                el_stats['tag_cnt'] += 1
            if child.name == 'a':
                el_stats['href_cnt'] += 1
                el_stats['text_in_links'] += len(child.text.strip().split())
            if isinstance(child, bs4.element.Tag):
                child_info = get_node_info(child, ctx.nodestats,
                                           no_insert=True)
                node_info.sum_stats(child_info)

    node_info.w2t_ratio = ratio(el_stats['word_cnt'], el_stats['tag_cnt'])
    node_info.t2w_ratio = ratio(el_stats['tag_cnt'], el_stats['word_cnt'])
    node_info.link_density = ratio(el_stats['text_in_links'],
                                   el_stats['word_cnt'])
    node_info.text_density = ratio(el_stats['word_cnt'],
                                   el_stats['num_wraped_lines'])
    node_info.stats_done = True
def _init_stats(ctx, soup):
    """Register every tag of ``soup`` in the node statistics.

    The layers returned by ``get_layers`` are visited from the highest key
    to the lowest.  For each tag the tag name is recorded in
    ``ctx.seen_tags`` and ``get_node_info`` is called for it.

    :param ctx: processing context providing ``nodestats`` and ``seen_tags``
    :param soup: parsed document
    :returns: ``None``
    """
    stats = ctx.nodestats
    layers = get_layers(soup)
    for depth in sorted(layers, reverse=True):
        for group in layers[depth]:
            for candidate in group:
                if not isinstance(candidate, bs4.element.Tag):
                    continue
                ctx.seen_tags.add(candidate.name)
                get_node_info(candidate, stats)
def get_direct_vector(ctx, prev_node, cur_node, next_node):
    """Build the feature tuple for ``cur_node`` and its two neighbours.

    A falsy ``prev_node`` or ``next_node`` is replaced by the node info of
    the dummy node.  The tuple holds, in this order: the positions of the
    three tag names in the sorted ``ctx.seen_tags``, a hash of the current
    node's CSS classes (``-1`` for ``dbce`` nodes and nodes without
    classes), link and text density of the three nodes, their word counts,
    text lengths and tag counts, and the text-length/tag-count ratio of the
    current node.

    :param ctx: processing context providing ``nodestats`` and ``seen_tags``
    :param prev_node: node info of the previous sibling, may be falsy
    :param cur_node: node info of the node being described
    :param next_node: node info of the next sibling, may be falsy
    :returns: tuple of features
    :raises ValueError: if one of the tag names is not in ``ctx.seen_tags``
    """
    dummy_node = get_node_info(get_dummy_node(ctx), ctx.nodestats,
                               no_insert=True)
    if not prev_node:
        prev_node = dummy_node
    if not next_node:
        next_node = dummy_node

    _known_tags = sorted(ctx.seen_tags)
    _node_class = -1
    if cur_node.node_name != 'dbce':
        _cls = cur_node.node.attrs.get('class', [])
        if len(_cls):
            # Sort a copy: sorting in place would reorder the class
            # attribute of the parsed document as a side effect.
            # NOTE(review): hash() of a str differs between interpreter
            # runs unless PYTHONHASHSEED is fixed - confirm that vectors
            # are never compared across processes.
            _node_class = hash(" ".join(sorted(_cls)))

    vector = (_known_tags.index(prev_node.node_name),
              _known_tags.index(cur_node.node_name),
              _known_tags.index(next_node.node_name),
              _node_class,
              prev_node.link_density, prev_node.text_density,
              cur_node.link_density, cur_node.text_density,
              next_node.link_density, next_node.text_density,
              prev_node.stats['word_cnt'],
              cur_node.stats['word_cnt'],
              next_node.stats['word_cnt'],
              prev_node.stats['text_len'],
              cur_node.stats['text_len'],
              next_node.stats['text_len'],
              prev_node.stats['tag_cnt'],
              cur_node.stats['tag_cnt'],
              next_node.stats['tag_cnt'],
              ratio(cur_node.stats['text_len'], cur_node.stats['tag_cnt']))
    return vector
def extract_content(soup, ctx):
    """Return the text of all ``dbce`` nodes classified as content.

    All descendants of ``soup.body`` are filtered for ``dbce`` tags.  The
    stripped text of each node accepted by ``is_content`` is wrapped to
    70 columns and the resulting lines are joined with newlines.

    :param soup: parsed document with a ``body``
    :param ctx: processing context providing ``nodestats``
    :returns: the extracted text as one string
    """
    out = []
    # ``descendants`` replaces the deprecated recursiveChildGenerator()
    # alias and yields the same nodes.
    for node in filter_tag(soup.body.descendants, ['dbce']):
        if is_content(get_node_info(node, ctx.nodestats, no_insert=True)):
            out.extend(textwrap.wrap(node.text.strip(), 70))
    return "\n".join(out)
def extend_nodes_class(node, ctx):
    """Add the node's dbce class and the ``dbce-marker`` class to ``node``.

    A missing ``class`` attribute is created as an empty list first.

    :param node: tag whose ``class`` attribute is extended
    :param ctx: processing context providing ``nodestats``
    :returns: ``None``
    :raises AssertionError: if a ``None`` value ended up in the class list
    """
    if 'class' not in node.attrs:
        node.attrs['class'] = []
    node_info = get_node_info(node, ctx.nodestats, no_insert=True)
    _insert_class_value(node, node_info.dbce_class)
    _insert_class_value(node, 'dbce-marker')
    # ``attrs`` is a mapping, so the class list has to be read by key;
    # getattr() on it always returned the default and checked nothing.
    assert all(value is not None for value in node.attrs.get('class', []))
def classify_elements_stage3(elements, ctx, clf=None, testname=None):
    """Classify a group of sibling elements with the classifier ``clf``.

    One feature vector is built per element (see ``get_vector``), all
    vectors are passed to ``clf.predict`` in a single call and each
    prediction is mapped to a dbce class: ``1`` becomes
    ``DBCE_CLASS_CONTENT``, everything else ``DBCE_CLASS_BOILER_PLATE``.

    :param elements: sequence of sibling nodes sharing one parent
    :param ctx: processing context providing ``nodestats``
    :param clf: object with a ``predict(vectors)`` method
    :param testname: forwarded to ``get_vector``
    :returns: lazy ``map`` of dbce classes in element order, or ``None``
        when ``elements`` is empty
    """
    if len(elements) < 1:
        logger.warning("not enough elements...")
        return

    # all nodes share the same parent
    parent = get_node_info(elements[0].parent, ctx.nodestats, no_insert=True)

    # Even preparing the log message is too expensive here, so it is only
    # built when debug output is enabled.  isEnabledFor() also honours a
    # level inherited from a parent logger, unlike comparing logger.level.
    debug_enabled = logger.isEnabledFor(logging.DEBUG)

    _vectors = []
    for (prev_node, cur_node, next_node) in iterate_over_elements(elements,
                                                                  ctx):
        _vec = get_vector(ctx, prev_node, cur_node, next_node, parent,
                          testname=testname)
        if debug_enabled:
            logger.debug(" vector : {}".format(_vec))
        _vectors.append(_vec)

    _oracle = clf.predict(_vectors)
    if debug_enabled:
        logger.debug(" oracle: {}".format(_oracle))

    def r2class(result):
        """Translate one classifier result into a dbce class."""
        if result == 1:
            return DBCE_CLASS_CONTENT
        return DBCE_CLASS_BOILER_PLATE

    re_classes = map(r2class, _oracle)
    return re_classes
def get_parent_environ(ctx, parent):
    """Return the node infos of the tags surrounding ``parent``.

    For an ``a`` node the lookup is repeated one level further up, using
    the node info of ``parent.parent``.

    :param ctx: processing context providing ``nodestats``
    :param parent: node info of the node whose neighbours are wanted
    :returns: ``(prev_sib, next_sib)`` where each entry is the node info
        of the nearest sibling accepted by ``filter_tag`` or ``None`` if
        there is none; ``None`` instead of a tuple when ``parent`` is falsy
    """
    if not parent:
        return None
    if parent.node_name in ['a']:
        return get_parent_environ(
            ctx, get_node_info(parent.parent, ctx.nodestats, no_insert=True))

    def _first_tag_info(siblings):
        """Node info of the first tag in ``siblings`` or ``None``."""
        for el in filter_tag(siblings):
            return get_node_info(el, ctx.nodestats, no_insert=True)
        return None

    # ``previous_siblings`` / ``next_siblings`` replace the deprecated
    # previousSiblingGenerator() / nextSiblingGenerator() aliases.
    prev_sib = _first_tag_info(parent.node.previous_siblings)
    next_sib = _first_tag_info(parent.node.next_siblings)
    return (prev_sib, next_sib)
def iterate_over_elements(elements, ctx):
    """Yield ``(previous, current, next)`` node infos for ``elements``.

    A sliding window of three nodes is moved over ``elements``.  Missing
    neighbours at both ends are filled with the dummy node.  For an empty
    input a single triple made of dummy node infos is produced.

    :param elements: iterable of nodes
    :param ctx: processing context providing ``nodestats``
    :returns: generator of 3-tuples of node infos
    """
    def _window(before, current, after):
        """Look up the node infos for one window position."""
        return (get_node_info(before, ctx.nodestats, no_insert=True),
                get_node_info(current, ctx.nodestats, no_insert=True),
                get_node_info(after, ctx.nodestats, no_insert=True))

    remaining = iter(elements)
    placeholder = get_dummy_node(ctx)

    before = placeholder
    current = next(remaining, placeholder)
    after = next(remaining, placeholder)
    yield _window(before, current, after)

    if not (after != placeholder):
        return

    while True:
        before, current = current, after
        try:
            after = next(remaining)
        except StopIteration:
            # last position: the right neighbour is the dummy node
            yield _window(before, current, placeholder)
            return
        yield _window(before, current, after)
def extend_nodes_info(ctx, element):
    """Copy class, statistics and id of ``element`` into ``add_html_info``.

    The values are stored under ``dbce-*`` keys of the node info's
    ``add_html_info`` mapping.  ``dbce-pp-class`` is only written when the
    node info has a truthy ``preprocess_class``.

    :param ctx: processing context providing ``nodestats``
    :param element: node whose node info is extended
    :returns: ``None``
    """
    node_info = get_node_info(element, ctx.nodestats, no_insert=True)
    nd_stat = node_info.stats
    html_info = node_info.add_html_info
    html_info['dbce-class'] = node_info.dbce_class
    html_info['dbce-wc'] = nd_stat['word_cnt']
    html_info['dbce-tc'] = nd_stat['tag_cnt']
    html_info['dbce-hrefs'] = nd_stat['href_cnt']
    html_info['dbce-w2t'] = node_info.w2t_ratio
    html_info['dbce-t2w'] = node_info.t2w_ratio
    html_info['dbce-lden'] = node_info.link_density
    html_info['dbce-tden'] = node_info.text_density
    html_info['dbce-id'] = node_info.nodeid
    if node_info.preprocess_class:
        # The class lives on the node info.  Attribute access on the
        # element itself would be a child-tag lookup on a bs4 tag and not
        # return the stored class.
        html_info['dbce-pp-class'] = node_info.preprocess_class
def set_nodes_class(node, cls, ctx, force=False):
    """Assign the dbce class ``cls`` to ``node`` and compute its statistics.

    :param node: node to classify
    :param cls: dbce class to store on the node info
    :param ctx: processing context providing ``nodestats``
    :param force: allow replacing an already assigned, different class
    :returns: ``None``
    :raises ValueError: if the node already has a class other than
        ``DBCE_CLASS_UNKNWON`` that differs from ``cls`` and ``force`` is
        not set
    """
    node_info = get_node_info(node, ctx.nodestats, no_insert=True)
    current = node_info.dbce_class
    has_real_class = current is not None and current != DBCE_CLASS_UNKNWON

    if not has_real_class:
        logger.debug("# you want me to set <{}> class? ({}) {} to {}".format(
            node_info.node_name, node_info.nodeid, current, cls))
    else:
        logger.debug(
            "# you want me to change <{}> class? ({}) {} to {}".format(
                node_info.node_name, node_info.nodeid, current, cls))
        if not force and current != cls:
            raise ValueError("class change {} -> {} not allowed".format(
                current, cls))

    node_info.dbce_class = cls
    node_stats(node, ctx)
def collect_vectors(ctx, soup):
    """Collect labelled feature vectors from an already classified tree.

    The layers of ``soup`` are visited from the highest key to the lowest.
    A vector is computed for every element for which the element itself
    or one of its neighbours is classified as content or boiler plate.
    The label is ``1`` when ``is_content`` accepts the node, else ``0``.

    A vector that was already stored with the opposite label is reported
    as a conflict and removed.  A vector seen again with the same label
    simply replaces the stored entry.

    :param ctx: processing context providing ``nodestats``
    :param soup: parsed and classified document
    :returns: ``OrderedDict`` mapping vector -> ``(xpath, label)``
    """
    classified = [DBCE_CLASS_CONTENT, DBCE_CLASS_BOILER_PLATE]
    vectors = OrderedDict()
    layers = get_layers(soup)
    for level in sorted(layers.keys(), reverse=True):
        for nodes in layers[level]:
            _elements = list(filter_tag(nodes))
            if not _elements:
                continue
            # all elements have the same parent so let's get it only once
            # bs4 is awful slow __getattr__.find() ...
            parent = get_node_info(_elements[0].parent, ctx.nodestats,
                                   no_insert=True)
            get_xpath(parent.node, ctx)
            for (prev_node, cur_node,
                 next_node) in iterate_over_elements(_elements, ctx):
                get_xpath(cur_node.node, ctx)
                if not (prev_node.dbce_class in classified
                        or cur_node.dbce_class in classified
                        or next_node.dbce_class in classified):
                    continue
                cur_node.clf_vector = get_vector(ctx, prev_node, cur_node,
                                                 next_node, parent)
                # Stored values are (xpath, label) tuples, so the label
                # has to be taken from the tuple before comparing it.
                known = vectors.get(cur_node.clf_vector)
                if is_content(cur_node):
                    if known is not None and known[1] != 1:
                        logger.error("vector conflict bp: {}".format(
                            cur_node.clf_vector))
                        # result is not used here; call kept from the
                        # original code
                        get_parent_environ(ctx, parent)
                        del vectors[cur_node.clf_vector]
                        continue
                    vectors[cur_node.clf_vector] = (cur_node.xpath, 1)
                else:
                    if known is not None and known[1] != 0:
                        logger.error("vector conflict content: {}".format(
                            cur_node.clf_vector))
                        get_parent_environ(ctx, parent)
                        del vectors[cur_node.clf_vector]
                        continue
                    vectors[cur_node.clf_vector] = (cur_node.xpath, 0)
    return vectors
def wrap_with_div(soup, ctx, versions=None, clean_markup=False, use_dbce=True):
    """Wrap every analysis tag of ``soup`` in a presentation tag.

    Each ``DBCE_MARKUP_ANALYSIS_TAG`` element is wrapped with a new
    ``DBCE_MARKUP_PRESENTATION_TAG`` element that carries the class
    information.  When ``versions`` is given, a ``dbce-info`` tag holding
    the version bits is inserted as first child of the body.

    :param soup: parsed document, modified in place
    :param ctx: processing context providing ``nodestats``
    :param versions: version bits used for ``classify_block``, optional
    :param clean_markup: if true, the analysis tags are removed again
        (unwrapped) after wrapping
    :param use_dbce: selects how the class of the wrapper is determined
    :returns: the modified ``soup``
    :raises ValueError: if ``use_dbce`` is false and a node has no
        ``dbce_class``
    """
    if versions:
        _new_tag = soup.new_tag('dbce-info')
        _new_tag['class'] = []
        _bits = [_ for _ in map(str, versions)]
        _insert_class_value(_new_tag, "dbce-bits-{}".format(''.join(_bits)))
        soup.body.insert(0, _new_tag)
    for todo in soup.find_all(DBCE_MARKUP_ANALYSIS_TAG):
        todo_info = get_node_info(todo, ctx.nodestats, no_insert=True)
        _new_tag = soup.new_tag(DBCE_MARKUP_PRESENTATION_TAG)
        _new_tag['class'] = []
        _node_bits = marker_to_bits(todo['class'])
        # NOTE(review): the nesting of this if/else was reconstructed;
        # confirm that the ``else`` belongs to ``if use_dbce``.
        if use_dbce:
            # class derived from the bit pattern of the node
            _content_class = DBCE_CLASS_UNKNWON
            if versions:
                _content_class = classify_block(versions, _node_bits)
            if _content_class == DBCE_CLASS_UNKNWON:
                _copy_class_values(todo, _new_tag)
        else:
            # class taken from the node info instead of the bits
            if todo_info.dbce_class is not None:
                _content_class = todo_info.dbce_class
                _insert_class_value(_new_tag, 'dbce-no-bits')
            else:
                raise ValueError(
                    "{}.dbce_class should be defined at this stage".format(
                        todo_info.nodeid))
        _insert_class_value(_new_tag, _content_class)
        _insert_class_value(_new_tag, 'dbce-marker')
        _insert_class_value(
            _new_tag, "dbce-bits-{}".format(''.join(map(str, _node_bits))))
        todo.wrap(_new_tag)
        if clean_markup:
            todo.unwrap()
    return soup
def stage2_step_classify(elements, ctx, testname, versions=None):
    """Classify the analysis tags of one sibling group by their bit pattern.

    For every ``DBCE_MARKUP_ANALYSIS_TAG`` element the class is computed
    with ``classify_block`` from the bits encoded in its ``class``
    attribute.  The class is stored through ``set_nodes_class`` and
    remembered as ``preprocess_class`` of the node info.  For test
    ``T002`` an invalid class is treated as boiler plate.

    :param elements: iterable of sibling nodes (same parent)
    :param ctx: processing context providing ``nodestats``
    :param testname: name of the running test
    :param versions: version bits passed to ``classify_block``
    :returns: ``None``
    :raises ValueError: if a node already has a ``preprocess_class``
    """
    treat_invalid_as_bp = testname in ['T002']
    for candidate in list(filter_tag(elements)):
        if not is_tag(candidate, [DBCE_MARKUP_ANALYSIS_TAG]):
            continue

        bits = marker_to_bits(candidate['class'])
        block_class = classify_block(versions, bits)
        if treat_invalid_as_bp and block_class == DBCE_CLASS_INVALID:
            block_class = DBCE_CLASS_BOILER_PLATE

        info = get_node_info(candidate, ctx.nodestats)
        get_xpath(candidate, ctx)
        set_nodes_class(candidate, block_class, ctx)

        # a node must be preprocessed only once
        if info.preprocess_class is not None:
            raise ValueError
        info.preprocess_class = block_class
        preprocess_fix_child(candidate, ctx)
def get_vector(ctx, prev_node, cur_node, next_node, parent=None, testname=None):
    """Build the classifier vector for ``cur_node``.

    The vector always starts with the direct vector of the node and its
    neighbours.  Depending on ``ctx.testname`` up to two further direct
    vectors are appended:

    * ``None``, ``T004``, ``T006``: vector of ``parent`` with the
      neighbours returned by ``get_parent_environ`` for ``parent``
    * ``None``, ``T005``, ``T006``: vector of ``parent`` with the
      neighbours returned by ``get_parent_environ`` for the node info of
      ``parent.parent``

    The ``testname`` argument is accepted but not read; the selection is
    made on ``ctx.testname``.

    :param ctx: processing context providing ``nodestats`` and ``testname``
    :param prev_node: node info of the previous sibling
    :param cur_node: node info of the node to describe
    :param next_node: node info of the next sibling
    :param parent: node info of the common parent
    :param testname: unused
    :returns: tuple of features
    """
    selector = ctx.testname
    parent_prev, parent_next = get_parent_environ(ctx, parent)
    features = get_direct_vector(ctx, prev_node, cur_node, next_node)

    if selector in [None, 'T004', 'T006']:
        features = features + get_direct_vector(ctx, parent_prev, parent,
                                                parent_next)

    if selector in [None, 'T005', 'T006']:
        grandparent = get_node_info(parent.parent, ctx.nodestats,
                                    no_insert=True)
        upper_prev, upper_next = get_parent_environ(ctx, grandparent)
        features = features + get_direct_vector(ctx, upper_prev, parent,
                                                upper_next)

    return features
def stage2_step_reclassify(elements, ctx, clf=None, testname=None):
    """Re-classify one group of sibling elements with the classifier.

    The elements are filtered with ``filter_tag`` and processed in reverse
    order.  Each element whose predicted class differs from its current
    class is re-classified with ``force=True``.  The exception is a
    boiler-plate prediction for a node whose parent is ``h1`` to ``h4``,
    which is only logged.  If anything changed and all elements now share
    one class, that class is also forced onto the common parent.

    :param elements: iterable of sibling nodes (same parent)
    :param ctx: processing context providing ``nodestats``
    :param clf: classifier passed to ``classify_elements_stage3``
    :param testname: forwarded to ``classify_elements_stage3``
    :returns: ``None``
    """
    _elements = list(filter_tag(elements))
    _elements.reverse()
    if len(_elements) < 1:
        return
    (_classes, _classes_cnt) = get_class_stats(_elements, ctx)
    _all_class_eq = is_list_equal(_classes)
    _el0_info = get_node_info(_elements[0], ctx.nodestats, no_insert=True)
    _el0_parent_info = get_node_info(_el0_info.parent, ctx.nodestats,
                                     no_insert=True)
    # FIXME: check if this is needed ... was copy&pasted from stage2
    #if _all_class_eq and _el0_parent_info.dbce_class == _classes[0] \
    #        and _classes[0] != DBCE_CLASS_UNKNWON:
    #    return
    # we have the same parent for all elements
    parent = _el0_parent_info.node
    logger.debug("# stage #3: classes: {} / {} / {}".format(
        _classes, _classes_cnt, _all_class_eq))
    # logger.debug("# stage #3: distribution: {}".format(percent_by_class))
    tmp_classes_lr = classify_elements_stage3(_elements, ctx, clf,
                                              testname=testname)
    # set once at least one element was actually re-classified
    changes = False
    for (node, cls) in zip(_elements, tmp_classes_lr):
        node_info = get_node_info(node, ctx.nodestats, no_insert=True)
        if cls is None:
            continue
        elif cls != node_info.dbce_class:
            # FIXME: define a testcase for this one or remove it
            # jusTex method
            logger.debug("# stage #3: reclassify id: {}".format(
                node_info.nodeid))
            if cls == DBCE_CLASS_BOILER_PLATE and node_info.parent.name in [
                    'h1', 'h2', 'h3', 'h4'
            ]:
                # headline content predicted as boiler plate: report only
                logger.error("take a look on this one - node_name is hX")
                #set_nodes_class(node, DBCE_CLASS_UNKNWON, ctx, force=True)
            else:
                # FIXME: make the change based on score?!
                set_nodes_class(node, cls, ctx, force=True)
                # NOTE(review): placement of this flag inside the else
                # branch was reconstructed - confirm.
                changes = True
        else:
            pass
    if not changes:
        return
    # classes changed, so the group statistics have to be refreshed
    (_classes, _classes_cnt) = get_class_stats(_elements, ctx)
    _all_class_eq = is_list_equal(_classes)
    logger.debug("# stage #3: RE classes: {} / {} / {}".format(
        _classes, _classes_cnt, _all_class_eq))
    if _all_class_eq:
        logger.debug("# stage #3: RE all classes are equal... set up parent")
        set_nodes_class(parent,
                        get_node_info(_elements[0], ctx.nodestats,
                                      no_insert=True).dbce_class,
                        ctx,
                        force=True)
    else:
        # FIXME: this should be extended...
        logger.debug("# stage #3: just skip it")
def get_dummy_node(ctx):
    """Return ``DUMMY_NODE`` after calling ``get_node_info`` for it.

    ``get_node_info`` is called without ``no_insert`` here, unlike the
    lookups elsewhere that pass ``no_insert=True``.

    :param ctx: processing context providing ``nodestats``
    :returns: the module-level ``DUMMY_NODE``
    """
    placeholder = DUMMY_NODE
    get_node_info(placeholder, ctx.nodestats)
    return placeholder
def preprocess_fix_child(element, ctx):
    """Pass the dbce class of ``element`` on to children without node info.

    :param element: node whose children are inspected
    :param ctx: processing context providing ``nodestats``
    :returns: ``None``
    """
    inherited_class = get_node_info(element, ctx.nodestats,
                                    no_insert=True).dbce_class
    for child in list(filter_tag(element.children)):
        child_info = get_node_info(child, ctx.nodestats, no_insert=True)
        # NOTE(review): this tests the node info itself, not its
        # ``dbce_class``; set_nodes_class() performs the same lookup and
        # reads ``dbce_class`` from the result - verify the intent.
        if child_info is not None:
            continue
        set_nodes_class(child, inherited_class, ctx)
def get_class_stats(elements, ctx):
    """Return the dbce classes of ``elements`` and how often each occurs.

    :param elements: iterable of nodes
    :param ctx: processing context providing ``nodestats``
    :returns: ``(classes, counter)`` - the list of classes in element
        order and a ``Counter`` built from that list
    """
    found = []
    for element in elements:
        info = get_node_info(element, ctx.nodestats, no_insert=True)
        found.append(info.dbce_class)
    return (found, Counter(found))