def prepare(self, obj):
    """Build the search document for an imported HTML file.

    Starts from the data prepared by the parent index, reads the file
    content through ``self.get_file_content(obj)`` and hands it to
    ``HtmlExtractor``. The extracted title and content are stored under
    the ``title`` and ``body`` keys, and the ``text`` field is rendered
    from a template context holding the object plus those two values.

    Returns the populated data dict.
    """
    prepared = super(ImportedFileIndex, self).prepare(obj)

    extractor = HtmlExtractor(self.get_file_content(obj))
    title = extractor.title
    body = extractor.content

    prepared['title'] = title
    prepared['body'] = body
    # The template receives the object together with the extracted content.
    prepared['text'] = self.fields['text'].prepare_template(
        {'object': obj, 'title': title, 'body': body}
    )

    logger.info(
        'Search Index: indexing file project=%s path=%s',
        obj.project_id,
        obj.path
    )
    return prepared
def test_strip_all(self):
    """strip_all is expected to return only the text inside a tagged fragment."""
    markup = '<div>tag</div>'
    stripped = HtmlExtractor('').strip_all(markup)
    self.assertEqual(stripped, 'tag')
def test_strip_all_no_document(self):
    """strip_all is expected to return an empty string for empty input."""
    stripped = HtmlExtractor('').strip_all('')
    self.assertEqual(stripped, '')
def test_title_raise_value_error(self):
    """title is expected to be None when calling the document raises ValueError."""
    html_extractor = HtmlExtractor("<h1></h1>")
    # Swap the parsed document for a callable that raises ValueError.
    failing_doc = Mock(side_effect=ValueError())
    html_extractor.doc = failing_doc
    self.assertIsNone(html_extractor.title)
def test_title_no_header(self):
    """title is expected to be None for markup that has no <h1> element."""
    html_extractor = HtmlExtractor("<div>Dummy Title</div>")
    found_title = html_extractor.title
    self.assertIsNone(found_title)
def test_title_no_doc(self):
    """title is expected to be None when the extractor is given an empty string."""
    found_title = HtmlExtractor("").title
    self.assertIsNone(found_title)
def test_title(self):
    """title is expected to be the text of the <h1> element."""
    html_extractor = HtmlExtractor("<h1>Dummy Title</h1>")
    found_title = html_extractor.title
    self.assertEqual(found_title, "Dummy Title")
def test_content_raise_value_error(self):
    """content is expected to be None when calling the document raises ValueError."""
    html_extractor = HtmlExtractor("<div></div>")
    # Swap the parsed document for a callable that raises ValueError.
    failing_doc = Mock(side_effect=ValueError())
    html_extractor.doc = failing_doc
    self.assertIsNone(html_extractor.content)
def test_content_no_body(self):
    """content is expected to be None for markup that has no <body> element."""
    html_extractor = HtmlExtractor("<div>Dummy Content</div>")
    found_content = html_extractor.content
    self.assertIsNone(found_content)
def test_content_no_doc(self):
    """content is expected to be None when the extractor is given an empty string."""
    found_content = HtmlExtractor("").content
    self.assertIsNone(found_content)
def test_content(self):
    """content is expected to be the text of the <body> element."""
    html_extractor = HtmlExtractor("<body>Dummy Content</body>")
    found_content = html_extractor.content
    self.assertEqual(found_content, "Dummy Content")
def test_init_with_empty_document(self):
    """An extractor built from an empty string is expected to have no doc."""
    html_extractor = HtmlExtractor("")
    self.assertIsNone(html_extractor.doc)
def test_init(self):
    """An extractor built from markup is expected to expose a PyQuery doc."""
    html_extractor = HtmlExtractor("<div></div>")

    # The doc must be set, and it must be a PyQuery instance.
    self.assertIsNotNone(html_extractor.doc)
    self.assertIsInstance(html_extractor.doc, PyQuery)