コード例 #1
0
ファイル: test_models.py プロジェクト: seomoz/dragnet
    def test_models(self):
        """Check every model's extracted content against the stored fixture.

        The fixture ``models_content.json`` holds one expected content string
        per model, in the same order as the ``models`` list below.
        """
        models = [kohlschuetter_model,
                  weninger_model,
                  kohlschuetter_weninger_model,
                  kohlschuetter_css_model,
                  kohlschuetter_css_weninger_model,
                  content_extractor,
                  content_comments_extractor]

        # Use a context manager so the fixture file handle is always closed.
        fixture_path = os.path.join(FIXTURES, 'models_content.json')
        with open(fixture_path, 'r') as fixture:
            actual_content = json.load(fixture)

        for k, m in enumerate(models):
            # some of the models (weninger) aren't deterministic
            # so the content doesn't match exactly every time,
            # although it passes most of the time
            # we allow up to 5 attempts per model before failing the test
            expected = actual_content[k].encode('utf-8')
            passed = False
            for _ in range_(5):
                content = m.analyze(self._html)
                if expected == content:
                    passed = True
                    break
            self.assertTrue(
                passed,
                'model #{0} never matched its fixture content'.format(k))
コード例 #2
0
ファイル: test_models.py プロジェクト: seomoz/dragnet
    def test_content_and_content_comments_extractor_blocks(self):
        '''
        With blocks=True, the combined extractor should yield blocks whose
        text matches the blocks of the two separate extractors
        '''
        expected_content = content_extractor.analyze(self._html, blocks=True)
        expected_comments = content_comments_extractor.analyze(
            self._html, blocks=True)

        def texts(blocks):
            # Blocks are compared through their ``text`` attribute only.
            return [block.text for block in blocks]

        content_ok = False
        comments_ok = False
        # Retry up to 5 times; stop at the first attempt where both match.
        for _attempt in range_(5):
            got_content, got_comments = \
                content_and_content_comments_extractor.analyze(
                    self._html, blocks=True)
            content_ok = texts(got_content) == texts(expected_content)
            comments_ok = texts(got_comments) == texts(expected_comments)
            if content_ok and comments_ok:
                break

        self.assertTrue(content_ok)
        self.assertTrue(comments_ok)
コード例 #3
0
def block_output_tokens(blocks, true_tokens):
    """
    Assert that every block's text splits into the expected tokens.

    blocks = the output from blockify; each element needs a ``text``
        string attribute
    true_tokens = a list of true tokens, one list of strings per block

    Each block's text is stripped and split on runs of whitespace before
    being compared. Raises AssertionError when the number of blocks differs
    from the number of token lists, or when any block's tokens differ.
    """
    assert len(blocks) == len(true_tokens), (
        "expected {0} blocks, got {1}".format(len(true_tokens), len(blocks)))
    # Lengths are equal at this point, so zip pairs every block with its
    # expected tokens without dropping any.
    for k, (block, expected) in enumerate(zip(blocks, true_tokens)):
        block_tokens = re.split(r"\s+", block.text.strip())
        assert block_tokens == expected, (
            "block {0}: {1!r} != {2!r}".format(k, block_tokens, expected))
コード例 #4
0
ファイル: test_kohlschuetter.py プロジェクト: thiseye/dragnet
    def block_output_tokens(self, blocks, true_tokens):
        """Assert that every block's text splits into the expected tokens.

           blocks = the output from blockify
           true_tokens = a list of true tokens

           Each block's ``text`` is stripped and split on runs of whitespace
           before being compared with the matching entry of true_tokens."""

        # assertEqual reports both lengths on failure, unlike assertTrue.
        self.assertEqual(len(blocks), len(true_tokens))

        for block, expected in zip(blocks, true_tokens):
            # Raw string: '\s' in a plain literal is an invalid escape.
            block_tokens = re.split(r'\s+', block.text.strip())
            self.assertEqual(block_tokens, expected)
コード例 #5
0
    def test_models(self):
        """Check each extractor's content against the fixture by token F1.

        The fixture ``models_content_mod.json`` holds one expected content
        string per model, in the same order as ``models``. A model passes
        when any attempt reaches an F1 of at least 0.98 against it.
        """
        models = [content_extractor, content_comments_extractor]

        # Use a context manager so the fixture file handle is always closed.
        fixture_path = os.path.join(FIXTURES, 'models_content_mod.json')
        with open(fixture_path, 'r') as fixture:
            actual_content = json.load(fixture)

        for k, m in enumerate(models):
            # some of the models (weninger) aren't deterministic
            # so the content doesn't match exactly every time,
            # although it passes most of the time
            # we allow up to 10 attempts per model before failing the test
            gold_standard = actual_content[k].encode('utf-8')
            passed = False
            for _ in range_(10):
                content = m.analyze(self._html)
                _, _, f1 = evaluation_metrics(simple_tokenizer(gold_standard),
                                              simple_tokenizer(content))
                if f1 >= 0.98:
                    passed = True
                    break
            self.assertTrue(
                passed,
                'model #{0} never reached f1 >= 0.98'.format(k))
コード例 #6
0
ファイル: test_models.py プロジェクト: seomoz/dragnet
    def test_content_and_content_comments_extractor(self):
        """The combined extractor should agree with the separate extractors.

        The comparison is retried up to 5 times and stops at the first
        attempt where both outputs match.
        """
        expected_content = content_extractor.analyze(self._html)
        expected_comments = content_comments_extractor.analyze(self._html)

        content_ok = False
        comments_ok = False
        for _attempt in range_(5):
            combined = content_and_content_comments_extractor.analyze(
                self._html)
            got_content, got_comments = combined
            content_ok = got_content == expected_content
            comments_ok = got_comments == expected_comments
            if content_ok and comments_ok:
                break

        self.assertTrue(content_ok)
        self.assertTrue(comments_ok)
コード例 #7
0
    def test_content_and_content_comments_extractor(self):
        """The combined extractor should agree with the separate extractors.

        The comparison is retried up to 10 times and stops at the first
        attempt where both outputs match.
        """
        reference_content = content_extractor.analyze(self._html)
        reference_comments = content_comments_extractor.analyze(self._html)

        content_matches = comments_match = False
        for _ in range_(10):
            combined_content, combined_comments = (
                content_and_content_comments_extractor.analyze(self._html))
            content_matches = combined_content == reference_content
            comments_match = combined_comments == reference_comments
            if content_matches and comments_match:
                break

        self.assertTrue(content_matches)
        self.assertTrue(comments_match)
コード例 #8
0
    def test_content_and_content_comments_extractor_blocks(self):
        '''
        The combined extractor's blocks should carry the same text as the
        blocks produced by the two standalone extractors
        '''
        reference = content_extractor.analyze(self._html, blocks=True)
        reference_comments = content_comments_extractor.analyze(
            self._html, blocks=True)

        def block_texts(blks):
            # Only the ``text`` of each block takes part in the comparison.
            return [blk.text for blk in blks]

        content_matches = False
        comments_match = False
        # Up to 5 attempts; leave the loop as soon as both outputs match.
        for _ in range_(5):
            result = content_and_content_comments_extractor.analyze(
                self._html, blocks=True)
            combined_content, combined_comments = result
            content_matches = (
                block_texts(combined_content) == block_texts(reference))
            comments_match = (
                block_texts(combined_comments) ==
                block_texts(reference_comments))
            if content_matches and comments_match:
                break

        self.assertTrue(content_matches)
        self.assertTrue(comments_match)
コード例 #9
0
ファイル: test_kohlschuetter.py プロジェクト: thiseye/dragnet
 def css_output_tokens(self, blocks, attrib, true_tokens):
     """Assert that each block's CSS value splits into the expected tokens.

     blocks = objects exposing a ``css`` mapping
     attrib = key looked up in each block's ``css`` mapping
     true_tokens = one list of expected tokens per block

     Each CSS value is stripped and split on runs of whitespace before
     being compared with the matching entry of true_tokens."""
     self.assertEqual(len(blocks), len(true_tokens))
     for block, expected in zip(blocks, true_tokens):
         # Raw string: '\s' in a plain literal is an invalid escape.
         css_tokens = re.split(r'\s+', block.css[attrib].strip())
         self.assertEqual(css_tokens, expected)
コード例 #10
0
ファイル: test_kohlschuetter.py プロジェクト: thiseye/dragnet
    def link_output_tokens(self, blocks, true_tokens):
        """Assert that each block's ``link_tokens`` equals the expected tokens.

        blocks = objects exposing a ``link_tokens`` attribute
        true_tokens = one expected ``link_tokens`` value per block
        """
        # assertEqual reports both lengths on failure, unlike assertTrue.
        self.assertEqual(len(blocks), len(true_tokens))

        for block, expected in zip(blocks, true_tokens):
            self.assertEqual(block.link_tokens, expected)
コード例 #11
0
def css_output_tokens(blocks, attrib, true_tokens):
    """
    Assert that each block's CSS value splits into the expected tokens.

    blocks = objects exposing a ``css`` mapping
    attrib = key looked up in each block's ``css`` mapping
    true_tokens = one list of expected tokens per block

    Each CSS value is stripped and split on runs of whitespace before being
    compared. Raises AssertionError when the number of blocks differs from
    the number of token lists, or when any block's tokens differ.
    """
    assert len(blocks) == len(true_tokens), (
        "expected {0} blocks, got {1}".format(len(true_tokens), len(blocks)))
    # Lengths are equal at this point, so zip pairs every block with its
    # expected tokens without dropping any.
    for k, (block, expected) in enumerate(zip(blocks, true_tokens)):
        css_tokens = re.split(r"\s+", block.css[attrib].strip())
        assert css_tokens == expected, (
            "block {0}: {1!r} != {2!r}".format(k, css_tokens, expected))
コード例 #12
0
def link_output_tokens(blocks, true_tokens):
    """
    Assert that each block's ``link_tokens`` equals the expected tokens.

    blocks = objects exposing a ``link_tokens`` attribute
    true_tokens = one expected ``link_tokens`` value per block

    Raises AssertionError when the number of blocks differs from the number
    of expected entries, or when any block's ``link_tokens`` differs.
    """
    assert len(blocks) == len(true_tokens), (
        "expected {0} blocks, got {1}".format(len(true_tokens), len(blocks)))
    # Lengths are equal at this point, so zip pairs every block with its
    # expected tokens without dropping any.
    for k, (block, expected) in enumerate(zip(blocks, true_tokens)):
        assert block.link_tokens == expected, (
            "block {0}: {1!r} != {2!r}".format(
                k, block.link_tokens, expected))