def test_title_attribute(self):
    """Words held in ``title`` attributes must show up in the plaintext."""
    cases = (
        # (input markup, words expected in the converted text)
        ('<p title="title">content</p>', 'title content'),
        (
            '<p title="title">content</p><p title="title2">content2</p>',
            'title content title2 content2',
        ),
    )
    for markup, expected_words in cases:
        self.assertContainsSameWords(html_to_plaintext(markup), expected_words)
def test_alt_attribute(self):
    """Words held in an image's ``alt`` attribute must show up in the plaintext."""
    cases = (
        # (input markup, words expected in the converted text)
        ('<img alt="image description" />', 'image description'),
        (
            '<p>content</p><img alt="image description" />',
            'content image description',
        ),
    )
    for markup, expected_words in cases:
        self.assertContainsSameWords(html_to_plaintext(markup), expected_words)
def test_comments(self):
    """HTML comments must not contribute any words to the plaintext."""
    # The comment is placed once before the element and once inside it.
    commented_markup = (
        "<!-- comment --><p>content</p>",
        "<p>content<!-- comment --></p>",
    )
    for markup in commented_markup:
        self.assertContainsSameWords(html_to_plaintext(markup), 'content')
def test_alt_attribute(self):
    """The ``alt`` text of an ``<img>`` is expected in the converted output."""
    # An image on its own.
    lone_image_text = html_to_plaintext('<img alt="image description" />')
    self.assertContainsSameWords(lone_image_text, 'image description')

    # An image that follows ordinary paragraph content.
    paragraph_and_image_text = html_to_plaintext(
        '<p>content</p><img alt="image description" />'
    )
    self.assertContainsSameWords(
        paragraph_and_image_text,
        'content image description',
    )
def test_title_attribute(self):
    """The ``title`` attribute text is expected alongside the element text."""
    # One element carrying a title.
    single_text = html_to_plaintext('<p title="title">content</p>')
    self.assertContainsSameWords(single_text, 'title content')

    # Two sibling elements, each with its own title.
    sibling_text = html_to_plaintext(
        '<p title="title">content</p><p title="title2">content2</p>'
    )
    self.assertContainsSameWords(sibling_text, 'title content title2 content2')
def test_comments(self):
    """Comment nodes are expected to be absent from the converted output."""
    # Comment placed before the paragraph.
    leading_comment_text = html_to_plaintext("<!-- comment --><p>content</p>")
    self.assertContainsSameWords(leading_comment_text, 'content')

    # Comment placed inside the paragraph.
    inner_comment_text = html_to_plaintext("<p>content<!-- comment --></p>")
    self.assertContainsSameWords(inner_comment_text, 'content')
def test_non_text(self):
    """``<script>`` and ``<style>`` bodies must not leak into the plaintext."""
    non_text_markup = (
        "<script>javascript</script><p>content</p>",
        # The trailing unclosed <p> is part of the original fixture.
        "<style>css</style><p>content<p>",
        # Markup nested inside a script element.
        "<script>javascript<p>javascript</p></script><p>content</p>",
    )
    for markup in non_text_markup:
        self.assertContainsSameWords(html_to_plaintext(markup), 'content')
def test_other_attributes(self):
    """Attributes such as ``href``, ``onload``, ``lang`` add no words."""
    cases = (
        # (input markup, words expected in the converted text)
        ('<a href="http://example.com">example.com</a>', 'example.com'),
        ('<body onload="javascript:alert(\'hello\')">content</body>', 'content'),
        ('<span lang="en" onclick="javascript:void(0)">content</span>', 'content'),
    )
    for markup, expected_words in cases:
        self.assertContainsSameWords(html_to_plaintext(markup), expected_words)
def test_non_text(self):
    """Script and stylesheet contents are expected to be dropped."""
    # Script element followed by real content.
    after_script = html_to_plaintext("<script>javascript</script><p>content</p>")
    self.assertContainsSameWords(after_script, 'content')

    # Style element; the trailing unclosed <p> is part of the original fixture.
    after_style = html_to_plaintext("<style>css</style><p>content<p>")
    self.assertContainsSameWords(after_style, 'content')

    # Markup nested inside the script element.
    after_nested_script = html_to_plaintext(
        "<script>javascript<p>javascript</p></script><p>content</p>"
    )
    self.assertContainsSameWords(after_nested_script, 'content')
def test_other_attributes(self):
    """Only element text is expected for ``href``/``onload``/``lang``/``onclick``."""
    # Link target must not be added; the link text itself stays.
    link_text = html_to_plaintext('<a href="http://example.com">example.com</a>')
    self.assertContainsSameWords(link_text, 'example.com')

    # Inline event handler on <body>.
    body_text = html_to_plaintext(
        '<body onload="javascript:alert(\'hello\')">content</body>'
    )
    self.assertContainsSameWords(body_text, 'content')

    # Several unrelated attributes on one element.
    span_text = html_to_plaintext(
        '<span lang="en" onclick="javascript:void(0)">content</span>'
    )
    self.assertContainsSameWords(span_text, 'content')
def prepare(self, obj):
    """Build the index data for ``obj``, adding fields from its published layout.

    Starts from the parent class's prepared data. When ``obj.content`` has a
    published node, ``title``, ``text`` and ``get_absolute_url`` are filled in
    from that node's content; otherwise the parent's data is returned as is.
    """
    self.prepared_data = super(BlogIndex, self).prepare(obj)

    lookup_request = fake_request()
    published_node = obj.content.get_published_node(lookup_request)
    if published_node is None:
        # prepare() has to work on unpublished blogs because haystack
        # filters them out at query time, not index time.
        return self.prepared_data

    layout = published_node.content
    render_context = {
        'request': fake_request(),
        'root_node_override': published_node,
    }
    rendered = render_root(render_context, obj, 'content')
    text_pieces = [html_to_plaintext(rendered), layout.title, layout.summary]

    self.prepared_data['title'] = layout.title
    self.prepared_data['text'] = ' '.join(text_pieces)
    self.prepared_data['get_absolute_url'] = obj.get_absolute_url_with_layout(layout)
    return self.prepared_data
def prepare(self, obj):
    """Return the prepared index data for ``obj``.

    The parent class's data is extended with ``title``, ``text`` and
    ``get_absolute_url`` taken from the published node of ``obj.content``,
    when such a node exists.
    """
    self.prepared_data = super(BlogIndex, self).prepare(obj)
    req = fake_request()
    current_node = obj.content.get_published_node(req)

    if current_node is not None:
        # prepare() has to work on unpublished blogs because haystack
        # filters them out at query time, not index time.
        layout_content = current_node.content
        context = dict(request=fake_request(), root_node_override=current_node)
        markup = render_root(context, obj, 'content')
        searchable = (
            html_to_plaintext(markup),
            layout_content.title,
            layout_content.summary,
        )
        self.prepared_data['title'] = layout_content.title
        self.prepared_data['text'] = ' '.join(searchable)
        absolute_url = obj.get_absolute_url_with_layout(layout_content)
        self.prepared_data['get_absolute_url'] = absolute_url

    return self.prepared_data
def prepare_text(self, obj):
    """Return the searchable text for ``obj`` as one space-joined string.

    The result is the title, the keywords from ``prepare_keywords``, the
    description, and the plaintext of the rendered ``root_node``, in that order.
    """
    # Both context keys are set to the object's page_ptr.
    render_context = dict(_current_page=obj.page_ptr, page=obj.page_ptr)
    rendered = render_root(render_context, obj, 'root_node')
    body_text = html_to_plaintext(rendered)
    keyword_text = ' '.join(self.prepare_keywords(obj))
    pieces = (obj.title, keyword_text, obj.description, body_text)
    return ' '.join(pieces)
def test_basic_conversion(self):
    """Plain element text must survive conversion, including nested markup."""
    cases = (
        # (input markup, words expected in the converted text)
        ("<p>content</p>", 'content'),
        ("<p>content with multiple words</p>", 'content with multiple words'),
        ("<p>multiple</p><p>content</p>", 'multiple content'),
        (
            '<div><p>complex</p><p>content</p></div> <p>with <a href="#">encapsulation</a></p>',
            'complex content with encapsulation',
        ),
    )
    for markup, expected_words in cases:
        self.assertContainsSameWords(html_to_plaintext(markup), expected_words)
def prepare_text(self, obj):
    """Return the searchable text for ``obj`` as one space-joined string.

    Combines, in order: title, plaintext of ``obj.content``, categories,
    authors, tags, slug, image caption and model subtitle.
    """
    body_text = html_to_plaintext(obj.content)
    category_text = ' '.join(prepare_attribute_list(obj, 'categories'))
    author_text = ' '.join(prepare_attribute_list(obj, 'authors'))
    # Each tag goes through force_text before joining.
    tag_text = ' '.join(map(force_text, obj.tags_list))

    pieces = (
        obj.title,
        body_text,
        category_text,
        author_text,
        tag_text,
        obj.slug,
        obj.image_caption,
        obj.model_subtitle,
    )
    return ' '.join(pieces)
def test_basic_conversion(self):
    """Element text is expected in the output for simple and nested markup."""
    # A single word in a single paragraph.
    one_word = html_to_plaintext("<p>content</p>")
    self.assertContainsSameWords(one_word, 'content')

    # Several words in a single paragraph.
    several_words = html_to_plaintext("<p>content with multiple words</p>")
    self.assertContainsSameWords(several_words, 'content with multiple words')

    # Two sibling paragraphs.
    two_paragraphs = html_to_plaintext("<p>multiple</p><p>content</p>")
    self.assertContainsSameWords(two_paragraphs, 'multiple content')

    # Nested containers plus an inline link.
    nested = html_to_plaintext(
        '<div><p>complex</p><p>content</p></div> <p>with <a href="#">encapsulation</a></p>'
    )
    self.assertContainsSameWords(nested, 'complex content with encapsulation')
def prepare(self, obj):
    """Build the index data for ``obj``, adding fields from its published layout.

    Starts from the parent class's prepared data. When ``obj.content`` has a
    published node, ``title``, ``text`` and ``get_absolute_url`` are filled in
    from that node's content.

    If there is no published node, the parent's data is returned unchanged
    instead of raising ``AttributeError`` on ``None``.
    """
    self.prepared_data = super(BlogIndex, self).prepare(obj)
    request = fake_request()
    node = obj.content.get_published_node(request)
    if node is None:
        # prepare() has to work on unpublished blogs because haystack
        # filters them out at query time, not index time. Without this
        # guard, ``node.content`` below would fail on None.
        return self.prepared_data

    blog_layout = node.content
    ctx = {
        'request': fake_request(),
        'root_node_override': node,
    }
    html = render_root(ctx, obj, 'content')
    content = [
        html_to_plaintext(html),
        blog_layout.title,
        blog_layout.summary,
    ]
    self.prepared_data['title'] = blog_layout.title
    self.prepared_data['text'] = ' '.join(content)
    self.prepared_data['get_absolute_url'] = obj.get_absolute_url_with_layout(blog_layout)
    return self.prepared_data
def prepare(self, obj):
    """Return the prepared index data for ``obj``.

    The parent class's data is extended with ``title``, ``text`` and
    ``get_absolute_url`` taken from the published node of ``obj.content``.

    When no published node exists the layout-derived fields are skipped and
    the parent's data is returned as is, rather than failing on ``None``.
    """
    self.prepared_data = super(BlogIndex, self).prepare(obj)
    request = fake_request()
    node = obj.content.get_published_node(request)
    if node is not None:
        # prepare() has to work on unpublished blogs because haystack
        # filters them out at query time, not index time; only touch
        # ``node.content`` when a published node is actually present.
        blog_layout = node.content
        ctx = {
            'request': fake_request(),
            'root_node_override': node,
        }
        html = render_root(ctx, obj, 'content')
        content = [
            html_to_plaintext(html),
            blog_layout.title,
            blog_layout.summary,
        ]
        self.prepared_data['title'] = blog_layout.title
        self.prepared_data['text'] = ' '.join(content)
        self.prepared_data['get_absolute_url'] = (
            obj.get_absolute_url_with_layout(blog_layout)
        )
    return self.prepared_data
def prepare_text(self, obj):
    """Return the searchable text for ``obj`` as one space-joined string.

    The result is the title, the keywords from ``prepare_keywords``, the
    description, and the plaintext of the rendered ``root_node``, in that order.
    """
    # This variant renders with an empty context.
    rendered = render_root({}, obj, "root_node")
    body_text = html_to_plaintext(rendered)
    keyword_text = " ".join(self.prepare_keywords(obj))
    pieces = (obj.title, keyword_text, obj.description, body_text)
    return " ".join(pieces)
def prepare_text(self, obj):
    """Assemble the full-text search string for ``obj``.

    Joins with single spaces: title, keywords (from ``prepare_keywords``),
    description, and the plaintext of ``obj``'s rendered ``root_node``.
    """
    # The page_ptr is supplied under both context keys.
    page_context = dict(_current_page=obj.page_ptr, page=obj.page_ptr)
    markup = render_root(page_context, obj, "root_node")
    plain_body = html_to_plaintext(markup)
    joined_keywords = " ".join(self.prepare_keywords(obj))
    return " ".join((obj.title, joined_keywords, obj.description, plain_body))