Beispiel #1
0
    def test_classify_cur_up_down_prev_combo(self):
        """Check the class returned for each combination of the middle bits.

        ``versions`` has its first four slots set (judging by the test name
        these are cur, up, down and prev -- the slot order is an assumption,
        confirm against ``classify_block``).  Every bit pattern below keeps
        the first slot set and the last slot clear, while slots two to four
        run through all eight combinations.
        """
        versions = (1, 1, 1, 1, 0)

        def expect(wanted, bits):
            # One call per case, so a failing traceback still points at the
            # exact bit pattern that was misclassified.
            self.assertEqual(wanted, classify_block(versions, bits))

        expect(DBCE_CLASS_INVALID, (1, 0, 0, 0, 0))
        expect(DBCE_CLASS_CONTENT, (1, 0, 0, 1, 0))
        expect(DBCE_CLASS_INVALID, (1, 0, 1, 0, 0))
        expect(DBCE_CLASS_INVALID, (1, 0, 1, 1, 0))
        expect(DBCE_CLASS_INVALID, (1, 1, 0, 0, 0))
        expect(DBCE_CLASS_INVALID, (1, 1, 0, 1, 0))
        expect(DBCE_CLASS_INVALID, (1, 1, 1, 0, 0))
        expect(DBCE_CLASS_BOILER_PLATE, (1, 1, 1, 1, 0))
Beispiel #2
0
def _sample_data_set(ctx, versions, all_data, manual_ann_data, manual_ann_cnt):
    """Collect the nodes that make up the test data set.

    Nodes whose xpath is contained in ``manual_ann_data`` are skipped.  The
    remaining nodes are grouped by the class ``classify_block`` assigns to
    them.  For every class that has fewer than ``n_sample`` manual
    annotations (according to ``manual_ann_cnt``) a random sample of that
    group is drawn to cover the shortfall, capped at the group size.

    NOTE: because of the FIXME near the end, every grouped node is appended
    to the result as well, so sampled nodes currently occur twice.

    Args:
        ctx: context object passed through to ``get_xpath``.
        versions: version bits passed through to ``classify_block``.
        all_data: iterable of nodes; each node must support ``node['class']``.
        manual_ann_data: container of xpaths that were annotated manually.
        manual_ann_cnt: mapping from class constant to the number of manual
            annotations for that class; must contain all three classes.

    Returns:
        list: the selected nodes.

    Raises:
        ValueError: if ``classify_block`` returns something other than the
            content, boiler plate or invalid class.
    """
    # The order (content, boiler plate, invalid) is significant: it fixes the
    # order of the log output, of the random draws and of the final result.
    bins = [
        (DBCE_CLASS_CONTENT, []),
        (DBCE_CLASS_BOILER_PLATE, []),
        (DBCE_CLASS_INVALID, []),
    ]

    for node in all_data:
        if get_xpath(node, ctx) in manual_ann_data:
            continue
        _class = classify_block(versions, marker_to_bits(node['class']))
        for known_class, members in bins:
            if _class == known_class:
                members.append(node)
                break
        else:
            # No bin matched the returned class.
            raise ValueError("unknown class: {}".format(_class))

    logger.info("got {} / {} / {} elements per class".format(
        *[len(members) for _, members in bins]))

    n_sample = 600
    test_data = []

    # try to split elements evenly: top up each class to n_sample elements
    for known_class, members in bins:
        missing = n_sample - manual_ann_cnt[known_class]
        if missing > 0:
            test_data += random.sample(members, min(len(members), missing))

    # FIXME: remove this change
    for _, members in bins:
        test_data += members
    logger.info("Test data: {}".format(len(test_data)))
    return test_data
Beispiel #3
0
def wrap_with_div(soup, ctx, versions=None, clean_markup=False, use_dbce=True):
    """Wrap every analysis tag of ``soup`` in a new presentation tag.

    Per the original description the analysis tag is ``<dbce>`` and the
    wrapper is a ``<div>``; the actual names come from
    ``DBCE_MARKUP_ANALYSIS_TAG`` and ``DBCE_MARKUP_PRESENTATION_TAG``.

    Each wrapper receives the content class, the ``dbce-marker`` class and a
    ``dbce-bits-<bits>`` class built from the node's own bits.  When
    ``versions`` is truthy, a ``dbce-info`` tag carrying the version bits is
    inserted as the first child of ``soup.body``.

    Args:
        soup: BeautifulSoup document; it is modified in place.
        ctx: context object; ``ctx.nodestats`` is handed to ``get_node_info``.
        versions: version bits, or ``None``/empty to skip classification.
        clean_markup: if true, the analysis tag itself is unwrapped after it
            has been wrapped, leaving only the wrapper around its content.
        use_dbce: if true, the content class comes from ``classify_block``
            (or stays ``DBCE_CLASS_UNKNWON`` without ``versions``); otherwise
            the ``dbce_class`` stored in the node info is used.

    Returns:
        The same ``soup`` object.

    Raises:
        ValueError: if ``use_dbce`` is false and a node has no ``dbce_class``.
    """
    if versions:
        info_tag = soup.new_tag('dbce-info')
        info_tag['class'] = []
        version_bits = ''.join(map(str, versions))
        _insert_class_value(info_tag, "dbce-bits-{}".format(version_bits))
        soup.body.insert(0, info_tag)

    for todo in soup.find_all(DBCE_MARKUP_ANALYSIS_TAG):
        todo_info = get_node_info(todo, ctx.nodestats, no_insert=True)

        wrapper = soup.new_tag(DBCE_MARKUP_PRESENTATION_TAG)
        wrapper['class'] = []
        node_bits = marker_to_bits(todo['class'])

        if not use_dbce:
            # The class must have been stored on the node info beforehand.
            if todo_info.dbce_class is None:
                raise ValueError(
                    "{}.dbce_class should be defined at this stage".format(
                        todo_info.nodeid))
            content_class = todo_info.dbce_class
            _insert_class_value(wrapper, 'dbce-no-bits')
        else:
            if versions:
                content_class = classify_block(versions, node_bits)
            else:
                content_class = DBCE_CLASS_UNKNWON
            # Unclassified nodes keep the class values of the analysis tag.
            if content_class == DBCE_CLASS_UNKNWON:
                _copy_class_values(todo, wrapper)

        _insert_class_value(wrapper, content_class)
        _insert_class_value(wrapper, 'dbce-marker')
        bits_label = ''.join(map(str, node_bits))
        _insert_class_value(wrapper, "dbce-bits-{}".format(bits_label))

        todo.wrap(wrapper)
        if clean_markup:
            todo.unwrap()

    return soup
Beispiel #4
0
def stage2_step_classify(elements, ctx, testname, versions=None):
    """Classify the analysis-tag nodes of one group by their bit patterns.

    Iterates over all elements of one group (same parent) that pass
    ``filter_tag``.  Every ``DBCE_MARKUP_ANALYSIS_TAG`` node is classified
    with ``classify_block``, the class is applied through
    ``set_nodes_class`` and recorded as ``preprocess_class`` on the node
    info, and finally ``preprocess_fix_child`` is run on the node.

    Args:
        elements: the elements of one group.
        ctx: context object; ``ctx.nodestats`` is used to look up node info.
        testname: name of the current test run; for ``'T002'`` nodes
            classified as invalid are treated as boiler plate.
        versions: version bits passed through to ``classify_block``.

    Raises:
        ValueError: if a node's info already has a ``preprocess_class``.
    """
    # Test runs in which "invalid" is downgraded to boiler plate.
    invalid_as_bp = testname in ('T002',)

    # Materialise the group up front, as the original did, before any node
    # is processed.
    for element in list(filter_tag(elements)):
        if not is_tag(element, [DBCE_MARKUP_ANALYSIS_TAG]):
            continue

        _class = classify_block(versions, marker_to_bits(element['class']))
        if invalid_as_bp and _class == DBCE_CLASS_INVALID:
            _class = DBCE_CLASS_BOILER_PLATE

        el_info = get_node_info(element, ctx.nodestats)
        # Result unused; presumably called for its side effect on ctx
        # (TODO confirm against get_xpath).
        get_xpath(element, ctx)
        set_nodes_class(element, _class, ctx)

        if el_info.preprocess_class is not None:
            raise ValueError(
                "preprocess_class already set to {!r}, refusing to "
                "overwrite it with {!r}".format(
                    el_info.preprocess_class, _class))
        el_info.preprocess_class = _class

        preprocess_fix_child(element, ctx)
Beispiel #5
0
 def test_classify_cur_up_next_11001(self):
     """Node bits identical to the version mask 11001 give boiler plate."""
     version_mask = (1, 1, 0, 0, 1)
     node_bits = (1, 1, 0, 0, 1)
     verdict = classify_block(version_mask, node_bits)
     self.assertEqual(DBCE_CLASS_BOILER_PLATE, verdict)
Beispiel #6
0
 def test_classify_cur_up_next_10001(self):
     """Bits 10001 against the version mask 11001 give content."""
     version_mask = (1, 1, 0, 0, 1)
     # Same as the mask except that the second slot is clear.
     node_bits = (1, 0, 0, 0, 1)
     verdict = classify_block(version_mask, node_bits)
     self.assertEqual(DBCE_CLASS_CONTENT, verdict)
Beispiel #7
0
 def test_classify_cur_up_down_10100(self):
     """Bits 10100 against the version mask 11100 give boiler plate."""
     version_mask = (1, 1, 1, 0, 0)
     # Same as the mask except that the second slot is clear.
     node_bits = (1, 0, 1, 0, 0)
     verdict = classify_block(version_mask, node_bits)
     self.assertEqual(DBCE_CLASS_BOILER_PLATE, verdict)
Beispiel #8
0
 def test_classify_cur_up_11000(self):
     """A node found in both `cur` and `up` is classified as boiler plate."""
     version_mask = (1, 1, 0, 0, 0)
     # the node carries both bits, i.e. it is in `cur` and in `up`
     node_bits = (1, 1, 0, 0, 0)
     verdict = classify_block(version_mask, node_bits)
     self.assertEqual(DBCE_CLASS_BOILER_PLATE, verdict)
Beispiel #9
0
 def test_classify_cur_up_10000(self):
     """A node found only in `cur` is classified as content."""
     version_mask = (1, 1, 0, 0, 0)
     # only the first bit is set, i.e. the node is in `cur` but not in `up`
     node_bits = (1, 0, 0, 0, 0)
     verdict = classify_block(version_mask, node_bits)
     self.assertEqual(DBCE_CLASS_CONTENT, verdict)
Beispiel #10
0
 def test_classify_cur_up_00000(self):
     """An all-zero bit pattern must make ``classify_block`` raise ValueError."""
     versions = (1, 1, 0, 0, 0)
     # element not found in either version
     bits = (0, 0, 0, 0, 0)
     # Only the call goes inside the context manager.  The former
     # assertEqual around it never ran when the test passed and wrongly
     # suggested that a return value was being checked.
     with self.assertRaises(ValueError):
         classify_block(versions, bits)
Beispiel #11
0
 def test_classify_cur_up_down_next_11100(self):
     """Bits 11100 against the version mask 11101 give invalid."""
     version_mask = (1, 1, 1, 0, 1)
     # Same as the mask except that the last slot is clear.
     node_bits = (1, 1, 1, 0, 0)
     verdict = classify_block(version_mask, node_bits)
     self.assertEqual(DBCE_CLASS_INVALID, verdict)