Example #1
0
    def test_get_text_context(self):
        """Check the text context returned for the ``//tt[1]`` match."""
        # Parse the fixture with its detected encoding, dropping comments.
        html_parser = etree.HTMLParser(
            remove_comments=True,
            encoding=cc.get_encoding(page_test2),
        )
        document = etree.fromstring(page_test2, html_parser).getroottree()
        eu.clean_tree(document)

        tt_element = document.xpath("//tt[1]")[0]
        self.assertEqual(
            "Hello World foobar. This is nice. Yo.",
            eu.get_text_context(tt_element),
        )
Example #2
0
    def test_get_text_context(self):
        """Check the text context returned for the ``//tt[1]`` match."""
        # Parse the fixture with its detected encoding, dropping comments.
        html_parser = etree.HTMLParser(
            remove_comments=True,
            encoding=cc.get_encoding(page_test2),
        )
        document = etree.fromstring(page_test2, html_parser).getroottree()
        eu.clean_tree(document)

        tt_element = document.xpath('//tt[1]')[0]
        self.assertEqual(
            'Hello World foobar. This is nice. Yo.',
            eu.get_text_context(tt_element),
        )
Example #3
0
    def test_word_count(self):
        """Check word counts for the ``//h1[1]`` and ``//body[1]`` matches."""
        html_parser = etree.HTMLParser(
            remove_comments=True,
            encoding=cc.get_encoding(page_test),
        )
        document = etree.fromstring(page_test, html_parser).getroottree()
        eu.clean_tree(document)

        # (XPath expression, expected word count), checked in this order.
        expectations = (
            ("//h1[1]", 6),
            ("//body[1]", 11),
        )
        for expression, expected_count in expectations:
            selector = eu.SingleXPath(expression)
            element = selector.get_element(document)
            actual_count = eu.get_word_count(
                selector.get_element_as_list(element))
            # Echo the matched text so it is visible in the test output.
            print(selector.get_text(element))
            self.assertEqual(expected_count, actual_count)
Example #4
0
    def test_word_count(self):
        """Check word counts for the ``//h1[1]`` and ``//body[1]`` matches."""
        html_parser = etree.HTMLParser(
            remove_comments=True,
            encoding=cc.get_encoding(page_test),
        )
        document = etree.fromstring(page_test, html_parser).getroottree()
        eu.clean_tree(document)

        # (XPath expression, expected word count), checked in this order.
        expectations = (
            ('//h1[1]', 6),
            ('//body[1]', 11),
        )
        for expression, expected_count in expectations:
            selector = eu.SingleXPath(expression)
            element = selector.get_element(document)
            actual_count = eu.get_word_count(
                selector.get_element_as_list(element))
            # Echo the matched text so it is visible in the test output.
            print(selector.get_text(element))
            self.assertEqual(expected_count, actual_count)
Example #5
0
    def test_get_sentence(self):
        """Check the sentence extracted around a word for several elements."""
        html_parser = etree.HTMLParser(
            remove_comments=True,
            encoding=cc.get_encoding(page_test2),
        )
        document = etree.fromstring(page_test2, html_parser).getroottree()
        eu.clean_tree(document)

        # (XPath expression, word to locate, expected sentence).
        cases = (
            ("//tt[1]", "foobar", "Hello World foobar."),
            # Test when there is more than one match!
            ("//code[2]", "foo", "This is foo."),
            # Test when there is more than one match, but wrong markup
            # (sorry...)
            ("//b[1]", "foo", "Hello World foo."),
        )
        for expression, word, expected_sentence in cases:
            element = document.xpath(expression)[0]
            context = eu.get_text_context(element)
            self.assertEqual(
                expected_sentence,
                eu.get_sentence(element, word, context),
            )
Example #6
0
    def test_get_sentence(self):
        """Check the sentence extracted around a word for several elements."""
        html_parser = etree.HTMLParser(
            remove_comments=True,
            encoding=cc.get_encoding(page_test2),
        )
        document = etree.fromstring(page_test2, html_parser).getroottree()
        eu.clean_tree(document)

        # (XPath expression, word to locate, expected sentence).
        cases = (
            ('//tt[1]', 'foobar', 'Hello World foobar.'),
            # Test when there is more than one match!
            ('//code[2]', 'foo', 'This is foo.'),
            # Test when there is more than one match, but wrong markup
            # (sorry...)
            ('//b[1]', 'foo', 'Hello World foo.'),
        )
        for expression, word, expected_sentence in cases:
            element = document.xpath(expression)[0]
            context = eu.get_text_context(element)
            self.assertEqual(
                expected_sentence,
                eu.get_sentence(element, word, context),
            )
Example #7
0
    def _process_page(self, page, load):
        """Parse one page file and populate ``page`` and ``load`` from it.

        The file at ``settings.PROJECT_FS_ROOT`` joined with
        ``page.file_path`` is read and parsed as HTML (comments removed,
        using the encoding reported by ``get_encoding``). The resulting tree
        is stored on ``load.tree`` and passed through ``clean_tree``.

        ``page.title``, ``page.word_count`` and ``page.xpath`` are then set
        and the page is saved. Sections are processed only when
        ``self._check_parser`` returns a truthy value.

        :param page: page object providing ``file_path``; updated and saved.
        :param load: per-page state object; receives the parsed ``tree``.
        """
        page_path = os.path.join(settings.PROJECT_FS_ROOT, page.file_path)
        # The context manager closes the file even if read() raises.
        with open(page_path) as page_file:
            content = page_file.read()

        encoding = get_encoding(content)
        parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
        load.tree = etree.fromstring(content, parser).getroottree()
        clean_tree(load.tree)

        page.title = self._process_page_title(page, load)

        body = self.xbody.get_element(load.tree)
        body_elements = self.xbody.get_element_as_list(body)
        page.word_count = get_word_count(body_elements)
        page.xpath = load.tree.getpath(body)
        page.save()

        self._process_init_page(page, load)
        # Skip section processing when the parser check fails.
        if not self._check_parser(page, load):
            return
        self._process_sections(page, load)
Example #8
0
    def _process_page(self, page, load):
        """Parse one page file and populate ``page`` and ``load`` from it.

        The file at ``settings.PROJECT_FS_ROOT`` joined with
        ``page.file_path`` is read and parsed as HTML (comments removed,
        using the encoding reported by ``get_encoding``). The resulting tree
        is stored on ``load.tree`` and passed through ``clean_tree``.

        ``page.title``, ``page.word_count`` and ``page.xpath`` are then set
        and the page is saved. Sections are processed only when
        ``self._check_parser`` returns a truthy value.

        :param page: page object providing ``file_path``; updated and saved.
        :param load: per-page state object; receives the parsed ``tree``.
        """
        page_path = os.path.join(settings.PROJECT_FS_ROOT, page.file_path)
        # The context manager closes the file even if read() raises.
        with open(page_path) as page_file:
            content = page_file.read()

        encoding = get_encoding(content)
        parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
        load.tree = etree.fromstring(content, parser).getroottree()
        clean_tree(load.tree)

        page.title = self._process_page_title(page, load)

        body = self.xbody.get_element(load.tree)
        body_elements = self.xbody.get_element_as_list(body)
        page.word_count = get_word_count(body_elements)
        page.xpath = load.tree.getpath(body)
        page.save()

        self._process_init_page(page, load)
        # Skip section processing when the parser check fails.
        if not self._check_parser(page, load):
            return
        self._process_sections(page, load)