def setUpClass(self):
    self.test_doc = os.path.join(settings.TESTDATA, 'httpclient402doc',
            'connmgmt.html')
    page = open(self.test_doc)
    content = page.read()
    page.close()
    encoding = cc.get_encoding(content)
    self.parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
    self.tree = etree.fromstring(content, self.parser).getroottree()
def test_encoding(self):
    url = 'http://www.infobart.com/index.php/about/'
    file_from = cc.get_file_from(url)
    content = file_from.read()
    encoding = cc.get_encoding(content)
    self.assertEqual(encoding, 'utf-8')
    self.assertTrue(len(content) > 0)
    file_from.close()
def test_get_text_context(self):
    encoding = cc.get_encoding(page_test2)
    parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
    tree = etree.fromstring(page_test2, parser).getroottree()
    eu.clean_tree(tree)
    tt = tree.xpath('//tt[1]')[0]
    text_context = eu.get_text_context(tt)
    self.assertEqual('Hello World foobar. This is nice. Yo.', text_context)
def process_page(self, url):
    self.logger.info('Processing page: ' + url)
    # Make a local copy of the page (ignoring any #fragment) and read it.
    local_url = self.make_copy(get_url_without_hash(url))
    local_page = urllib2.urlopen(local_url)
    content = local_page.read()
    local_page.close()
    # Parse the copy with the encoding detected from its content.
    parser = etree.HTMLParser(encoding=get_encoding(content))
    tree = etree.fromstring(content, parser)
    # Process the page's links and images, then build the DocumentPage.
    links = self.process_page_links(tree, local_url, url)
    self.process_page_imgs(tree, url)
    page = DocumentPage(url, local_url, links)
    return page
def test_word_count(self):
    encoding = cc.get_encoding(page_test)
    parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
    tree = etree.fromstring(page_test, parser).getroottree()
    eu.clean_tree(tree)
    h1 = eu.SingleXPath('//h1[1]')
    h1_element = h1.get_element(tree)
    wc = eu.get_word_count(h1.get_element_as_list(h1_element))
    print(h1.get_text(h1_element))
    self.assertEqual(6, wc)
    body = eu.SingleXPath('//body[1]')
    body_element = body.get_element(tree)
    wc = eu.get_word_count(body.get_element_as_list(body_element))
    print(body.get_text(body_element))
    self.assertEqual(11, wc)
def load_snippets(self):
    from_path = os.path.join(settings.TESTDATA, 'snippets')
    snippets = []
    for i, path in enumerate(sorted(os.listdir(from_path))):
        if path.endswith('.java'):
            with open(os.path.join(from_path, path)) as f:
                text = f.read()
            encoding = get_encoding(text)
            content = unicode(text, encoding)
            snippet = CodeSnippet(
                index=i,
                project=self.project,
                snippet_text=content,
                language='j',
                source='d',
            )
            snippet.save()
            snippets.append(snippet)
    return snippets
def test_get_sentence(self):
    encoding = cc.get_encoding(page_test2)
    parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
    tree = etree.fromstring(page_test2, parser).getroottree()
    eu.clean_tree(tree)
    tt = tree.xpath('//tt[1]')[0]
    text_context = eu.get_text_context(tt)
    sentence = eu.get_sentence(tt, 'foobar', text_context)
    self.assertEqual('Hello World foobar.', sentence)

    # Test when there is more than one match.
    code = tree.xpath('//code[2]')[0]
    text_context = eu.get_text_context(code)
    sentence = eu.get_sentence(code, 'foo', text_context)
    self.assertEqual('This is foo.', sentence)

    # Test when there is more than one match, but the markup is wrong (sorry...).
    b = tree.xpath('//b[1]')[0]
    text_context = eu.get_text_context(b)
    sentence = eu.get_sentence(b, 'foo', text_context)
    self.assertEqual('Hello World foo.', sentence)
def _process_page(self, page, load):
    page_path = os.path.join(settings.PROJECT_FS_ROOT, page.file_path)
    page_file = open(page_path)
    content = page_file.read()
    page_file.close()
    # Parse the page with the encoding detected from its content, then
    # clean the resulting tree.
    encoding = get_encoding(content)
    parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
    load.tree = etree.fromstring(content, parser).getroottree()
    clean_tree(load.tree)
    page.title = self._process_page_title(page, load)
    # Record the body's word count and XPath before saving the page.
    body = self.xbody.get_element(load.tree)
    body_elements = self.xbody.get_element_as_list(body)
    page.word_count = get_word_count(body_elements)
    page.xpath = load.tree.getpath(body)
    page.save()
    # Only process the page's sections if the parser check passes.
    self._process_init_page(page, load)
    check = self._check_parser(page, load)
    if not check:
        return
    self._process_sections(page, load)
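

# The functions above all share the same parse pattern: detect the page
# encoding, build an lxml HTMLParser with that encoding, and turn the raw
# content into an ElementTree. The sketch below shows that pattern in a
# minimal, self-contained form. It only assumes lxml; guess_encoding is a
# hypothetical stand-in for the project's get_encoding helper, which may be
# more sophisticated (e.g. honouring <meta> charset declarations).

from lxml import etree


def guess_encoding(content):
    # Hypothetical helper: try UTF-8 first, fall back to Latin-1.
    try:
        content.decode('utf-8')
        return 'utf-8'
    except UnicodeDecodeError:
        return 'iso-8859-1'


def parse_html(content):
    # Same pattern as the methods above: a comment-stripping parser built
    # with the detected encoding, returning an ElementTree for XPath use.
    encoding = guess_encoding(content)
    parser = etree.HTMLParser(remove_comments=True, encoding=encoding)
    return etree.fromstring(content, parser).getroottree()


# Example usage:
# tree = parse_html(open('connmgmt.html', 'rb').read())
# first_heading = tree.xpath('//h1[1]')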