Esempio n. 1
0
    def test_word_count(self):
        """Check the word count of the ``//h1[1]`` and ``//body[1]`` selections.

        The test page is parsed with comments removed and passed through
        ``eu.clean_tree`` before any counting happens.
        """
        html_parser = etree.HTMLParser(
            remove_comments=True,
            encoding=cc.get_encoding(page_test),
        )
        tree = etree.fromstring(page_test, html_parser).getroottree()
        eu.clean_tree(tree)

        # (xpath expression, expected number of words), checked in order.
        expectations = (
            ("//h1[1]", 6),
            ("//body[1]", 11),
        )
        for expression, expected_words in expectations:
            selector = eu.SingleXPath(expression)
            element = selector.get_element(tree)
            actual_words = eu.get_word_count(
                selector.get_element_as_list(element))
            # Echo the selected text so a failing count is easier to diagnose.
            print(selector.get_text(element))
            self.assertEqual(expected_words, actual_words)
Esempio n. 2
0
    def test_word_count(self):
        """Verify ``eu.get_word_count`` on the heading and body of the test page.

        Expected counts: 6 words for ``//h1[1]`` and 11 words for
        ``//body[1]``, measured after ``eu.clean_tree`` has run on the tree.
        """
        # Build the cleaned document tree from the raw test page.
        detected_encoding = cc.get_encoding(page_test)
        html_parser = etree.HTMLParser(
            remove_comments=True, encoding=detected_encoding)
        root = etree.fromstring(page_test, html_parser)
        tree = root.getroottree()
        eu.clean_tree(tree)

        # Heading selection.
        heading_xpath = eu.SingleXPath('//h1[1]')
        heading = heading_xpath.get_element(tree)
        heading_parts = heading_xpath.get_element_as_list(heading)
        heading_words = eu.get_word_count(heading_parts)
        print(heading_xpath.get_text(heading))
        self.assertEqual(6, heading_words)

        # Body selection.
        body_xpath = eu.SingleXPath('//body[1]')
        body_node = body_xpath.get_element(tree)
        body_parts = body_xpath.get_element_as_list(body_node)
        body_words = eu.get_word_count(body_parts)
        print(body_xpath.get_text(body_node))
        self.assertEqual(11, body_words)
Esempio n. 3
0
    def _process_page(self, page, load):
        """Parse one page file, store its metadata and process its sections.

        The file at ``settings.PROJECT_FS_ROOT / page.file_path`` is read and
        parsed as HTML (comments removed); the resulting tree is stored on
        ``load.tree`` and cleaned with ``clean_tree``. The page's ``title``,
        ``word_count`` and ``xpath`` (path of the body element) are then set
        and the page is saved. Sections are processed only when
        ``self._check_parser`` returns a truthy value.

        Args:
            page: Page record exposing ``file_path``, ``title``,
                ``word_count``, ``xpath`` and ``save()``.
            load: Per-run state object; receives the parsed ``tree``.
        """
        page_path = os.path.join(settings.PROJECT_FS_ROOT, page.file_path)
        # The context manager closes the file even if read() raises.
        with open(page_path) as page_file:
            content = page_file.read()

        encoding = get_encoding(content)
        parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
        load.tree = etree.fromstring(content, parser).getroottree()
        clean_tree(load.tree)

        page.title = self._process_page_title(page, load)

        body = self.xbody.get_element(load.tree)
        body_elements = self.xbody.get_element_as_list(body)
        page.word_count = get_word_count(body_elements)
        page.xpath = load.tree.getpath(body)
        page.save()

        self._process_init_page(page, load)
        # Skip section processing when the parser check fails.
        if not self._check_parser(page, load):
            return
        self._process_sections(page, load)
Esempio n. 4
0
    def _process_page(self, page, load):
        """Parse one page file, store its metadata and process its sections.

        The file at ``settings.PROJECT_FS_ROOT / page.file_path`` is read and
        parsed as HTML (comments removed); the resulting tree is stored on
        ``load.tree`` and cleaned with ``clean_tree``. The page's ``title``,
        ``word_count`` and ``xpath`` (path of the body element) are then set
        and the page is saved. Sections are processed only when
        ``self._check_parser`` returns a truthy value.

        Args:
            page: Page record exposing ``file_path``, ``title``,
                ``word_count``, ``xpath`` and ``save()``.
            load: Per-run state object; receives the parsed ``tree``.
        """
        page_path = os.path.join(settings.PROJECT_FS_ROOT, page.file_path)
        # The context manager closes the file even if read() raises.
        with open(page_path) as page_file:
            content = page_file.read()

        encoding = get_encoding(content)
        parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
        load.tree = etree.fromstring(content, parser).getroottree()
        clean_tree(load.tree)

        page.title = self._process_page_title(page, load)

        body = self.xbody.get_element(load.tree)
        body_elements = self.xbody.get_element_as_list(body)
        page.word_count = get_word_count(body_elements)
        page.xpath = load.tree.getpath(body)
        page.save()

        self._process_init_page(page, load)
        # Skip section processing when the parser check fails.
        if not self._check_parser(page, load):
            return
        self._process_sections(page, load)