def test_classify_cur_up_down_prev_combo(self):
    # The first four version flags are set (going by the test name,
    # presumably cur/up/down/prev).  Walk all eight bit patterns that have
    # the first bit set and the last bit cleared; one assertion per line so
    # a failure points straight at the offending pattern.
    versions = (1, 1, 1, 1, 0)

    # second bit cleared
    self.assertEqual(
        DBCE_CLASS_INVALID, classify_block(versions, (1, 0, 0, 0, 0)))
    self.assertEqual(
        DBCE_CLASS_CONTENT, classify_block(versions, (1, 0, 0, 1, 0)))
    self.assertEqual(
        DBCE_CLASS_INVALID, classify_block(versions, (1, 0, 1, 0, 0)))
    self.assertEqual(
        DBCE_CLASS_INVALID, classify_block(versions, (1, 0, 1, 1, 0)))

    # second bit set
    self.assertEqual(
        DBCE_CLASS_INVALID, classify_block(versions, (1, 1, 0, 0, 0)))
    self.assertEqual(
        DBCE_CLASS_INVALID, classify_block(versions, (1, 1, 0, 1, 0)))
    self.assertEqual(
        DBCE_CLASS_INVALID, classify_block(versions, (1, 1, 1, 0, 0)))
    self.assertEqual(
        DBCE_CLASS_BOILER_PLATE, classify_block(versions, (1, 1, 1, 1, 0)))
def _sample_data_set(ctx, versions, all_data, manual_ann_data, manual_ann_cnt):
    """Collect the not-yet-annotated nodes that make up the test data.

    Every node of ``all_data`` whose xpath is not in ``manual_ann_data`` is
    classified with ``classify_block`` and put into one bucket per class
    (content, boilerplate, invalid).  For each class whose manual
    annotation count is below 600, a random sample covering the shortfall
    (capped at the bucket size) is drawn from that class' bucket.

    NOTE: because of the temporary change flagged with FIXME below, every
    bucketed node is appended after the samples as well, so sampled nodes
    occur twice in the result.

    Args:
        ctx: context object handed through to ``get_xpath``.
        versions: version flags handed through to ``classify_block``.
        all_data: iterable of nodes; each must support ``node['class']``.
        manual_ann_data: container of xpaths that were already annotated
            manually; nodes with such an xpath are skipped.
        manual_ann_cnt: number of existing manual annotations, indexable
            by DBCE class constant.

    Returns:
        list: the sampled nodes followed by all bucketed nodes.

    Raises:
        ValueError: if ``classify_block`` yields a class other than
            content, boilerplate or invalid.
    """
    # One bucket per class.  The order (content, boilerplate, invalid) is
    # significant: it fixes the order of the random.sample() calls and of
    # the concatenation at the end.
    buckets = (
        (DBCE_CLASS_CONTENT, []),
        (DBCE_CLASS_BOILER_PLATE, []),
        (DBCE_CLASS_INVALID, []),
    )

    for node in all_data:
        if get_xpath(node, ctx) in manual_ann_data:
            continue
        block_class = classify_block(versions, marker_to_bits(node['class']))
        for label, bucket in buckets:
            if block_class == label:
                bucket.append(node)
                break
        else:
            raise ValueError("unknown class: {}".format(block_class))

    logger.info("got {} / {} / {} elements per class".format(
        *(len(bucket) for _, bucket in buckets)))

    n_sample = 600
    test_data = []
    # try to split elements evenly: top up each class to n_sample
    for label, bucket in buckets:
        shortfall = n_sample - manual_ann_cnt[label]
        if shortfall > 0:
            test_data += random.sample(bucket, min(len(bucket), shortfall))

    # FIXME: remove this change
    for _, bucket in buckets:
        test_data += bucket

    logger.info("Test data: {}".format(len(test_data)))
    return test_data
def wrap_with_div(soup, ctx, versions=None, clean_markup=False, use_dbce=True):
    """Wrap every DBCE analysis tag of *soup* in a presentation tag.

    Each ``DBCE_MARKUP_ANALYSIS_TAG`` element is wrapped in a freshly
    created ``DBCE_MARKUP_PRESENTATION_TAG`` element (a ``<div>``, going by
    the function name) that carries the block class, ``dbce-marker`` and a
    ``dbce-bits-<bits>`` class.  When *versions* is given, a ``dbce-info``
    tag holding the version bits is additionally inserted as first child
    of ``<body>``.

    Args:
        soup: BeautifulSoup document; it is modified in place.
        ctx: context object; its ``nodestats`` is passed to
            ``get_node_info``.
        versions: optional version flags used to classify the blocks.
        clean_markup: if true, the analysis tag itself is unwrapped after
            it has been wrapped.
        use_dbce: if true, the class comes from ``classify_block`` (or
            stays ``DBCE_CLASS_UNKNWON`` without *versions*); otherwise
            the ``dbce_class`` stored in the node info is used.

    Returns:
        The same *soup* object.

    Raises:
        ValueError: if *use_dbce* is false and a node has no
            ``dbce_class`` yet.
    """
    if versions:
        info_tag = soup.new_tag('dbce-info')
        info_tag['class'] = []
        version_bits = ''.join(map(str, versions))
        _insert_class_value(info_tag, "dbce-bits-{}".format(version_bits))
        soup.body.insert(0, info_tag)

    for marker in soup.find_all(DBCE_MARKUP_ANALYSIS_TAG):
        marker_info = get_node_info(marker, ctx.nodestats, no_insert=True)
        wrapper = soup.new_tag(DBCE_MARKUP_PRESENTATION_TAG)
        wrapper['class'] = []
        node_bits = marker_to_bits(marker['class'])

        if not use_dbce:
            # The class must have been stored on the node info beforehand.
            if marker_info.dbce_class is None:
                raise ValueError(
                    "{}.dbce_class should be defined at this stage".format(
                        marker_info.nodeid))
            block_class = marker_info.dbce_class
            _insert_class_value(wrapper, 'dbce-no-bits')
        else:
            if versions:
                block_class = classify_block(versions, node_bits)
            else:
                block_class = DBCE_CLASS_UNKNWON
            # NOTE(review): the unknown-class check is applied whether or
            # not `versions` was given - confirm this nesting is intended.
            if block_class == DBCE_CLASS_UNKNWON:
                _copy_class_values(marker, wrapper)

        # The insertion order below determines the order of the class list.
        _insert_class_value(wrapper, block_class)
        _insert_class_value(wrapper, 'dbce-marker')
        bits_label = ''.join(map(str, node_bits))
        _insert_class_value(wrapper, "dbce-bits-{}".format(bits_label))

        marker.wrap(wrapper)
        if clean_markup:
            marker.unwrap()

    return soup
def stage2_step_classify(elements, ctx, testname, versions=None):
    """Classify the <dbce> nodes of one group (elements sharing a parent).

    Every ``DBCE_MARKUP_ANALYSIS_TAG`` element is classified from its bit
    pattern with ``classify_block``; the class is applied through
    ``set_nodes_class`` and recorded as ``preprocess_class`` on the node
    info.  ``preprocess_fix_child`` is then called for each element of the
    group.

    Args:
        elements: the group's elements; filtered through ``filter_tag``.
        ctx: context object; its ``nodestats`` is passed to
            ``get_node_info``.
        testname: name of the running test; for ``'T002'`` blocks
            classified as invalid are treated as boilerplate.
        versions: version flags handed through to ``classify_block``.

    Raises:
        ValueError: if a node already has a ``preprocess_class``.
    """
    # Materialise the group before iterating.
    # NOTE(review): presumably because the helpers below modify the tree
    # while we loop - confirm.
    group = list(filter_tag(elements))
    invalid_as_bp = testname in ['T002']

    for element in group:
        if is_tag(element, [DBCE_MARKUP_ANALYSIS_TAG]):
            block_class = classify_block(
                versions,
                marker_to_bits(element['class']),
            )
            if invalid_as_bp and block_class == DBCE_CLASS_INVALID:
                block_class = DBCE_CLASS_BOILER_PLATE

            el_info = get_node_info(element, ctx.nodestats)
            # Return value is deliberately discarded.
            # NOTE(review): presumably called for a caching side effect -
            # confirm.
            get_xpath(element, ctx)
            set_nodes_class(element, block_class, ctx)

            # A node must be classified only once in this stage.
            if el_info.preprocess_class is not None:
                raise ValueError(
                    "{}.preprocess_class is already set".format(
                        el_info.nodeid))
            el_info.preprocess_class = block_class

        # NOTE(review): runs for every element of the group, not only for
        # <dbce> tags - confirm this nesting is intended.
        preprocess_fix_child(element, ctx)
def test_classify_cur_up_next_11001(self):
    # The block's bits are set for every available version
    # (bits == versions).
    available = (1, 1, 0, 0, 1)
    marked = (1, 1, 0, 0, 1)
    result = classify_block(available, marked)
    self.assertEqual(DBCE_CLASS_BOILER_PLATE, result)
def test_classify_cur_up_next_10001(self):
    # Of the three available versions, the block's bit for the second one
    # is cleared.
    available = (1, 1, 0, 0, 1)
    marked = (1, 0, 0, 0, 1)
    result = classify_block(available, marked)
    self.assertEqual(DBCE_CLASS_CONTENT, result)
def test_classify_cur_up_down_10100(self):
    # First three versions available; the block's bits are set for the
    # first and third, but not for the second.
    available = (1, 1, 1, 0, 0)
    marked = (1, 0, 1, 0, 0)
    result = classify_block(available, marked)
    self.assertEqual(DBCE_CLASS_BOILER_PLATE, result)
def test_classify_cur_up_11000(self):
    available = (1, 1, 0, 0, 0)
    # the element shows up in `cur` as well as in `up`
    marked = (1, 1, 0, 0, 0)
    result = classify_block(available, marked)
    self.assertEqual(DBCE_CLASS_BOILER_PLATE, result)
def test_classify_cur_up_10000(self):
    available = (1, 1, 0, 0, 0)
    # the element shows up in `cur` only
    marked = (1, 0, 0, 0, 0)
    result = classify_block(available, marked)
    self.assertEqual(DBCE_CLASS_CONTENT, result)
def test_classify_cur_up_00000(self):
    versions = (1, 1, 0, 0, 0)
    # element found in neither version: classify_block is expected to
    # reject the all-zero pattern with a ValueError
    bits = (0, 0, 0, 0, 0)
    # Call classify_block directly.  Wrapping it in an assertEqual against
    # DBCE_CLASS_INVALID would contradict the expectation that no value is
    # returned at all.
    with self.assertRaises(ValueError):
        classify_block(versions, bits)
def test_classify_cur_up_down_next_11100(self):
    # Four versions are available (first three and the last); the block's
    # bit for the last one is cleared.
    available = (1, 1, 1, 0, 1)
    marked = (1, 1, 1, 0, 0)
    result = classify_block(available, marked)
    self.assertEqual(DBCE_CLASS_INVALID, result)