def extract_from_file(sender, instance, **kwargs):
    """Populate ``instance`` with content Tika extracts from its file.

    The ``(sender, instance, **kwargs)`` signature looks like a signal
    receiver (presumably Django -- confirm where this is connected);
    ``sender`` and ``kwargs`` are accepted but never read here.

    When ``Tika.ping()`` is falsy nothing happens and ``instance`` is left
    untouched. Otherwise the file at ``instance.original_file.path`` is
    handed to ``Tika.from_file`` and three attributes are assigned:

    * ``extracted_text`` -- the handle's ``text()`` passed through
      ``UnicodeDammit`` and read back via ``unicode_markup``.
    * ``extracted_html`` -- the handle's ``html()``.
    * ``metadata`` -- the handle's ``meta()``.

    The instance is only mutated; this function does not save it.
    """
    # Bail out early when Tika does not answer the ping.
    if not Tika.ping():
        return

    handle = Tika.from_file(instance.original_file.path)

    # Run the raw text through UnicodeDammit before storing it.
    instance.extracted_text = UnicodeDammit(handle.text()).unicode_markup
    instance.extracted_html = handle.html()
    instance.metadata = handle.meta()
def tika_handle(self):
    """Return a Tika handle for ``self.path``, or ``None``.

    ``None`` is returned when ``Tika.ping()`` is falsy, when ``self.path``
    is falsy (the ping is checked first, so ``self.path`` is not read if
    the ping fails), or when ``Tika.from_file`` itself yields a falsy
    value.
    """
    # Explicit guard instead of the ``cond and value or default`` chain,
    # which obscures the control flow.
    if not Tika.ping() or not self.path:
        return None

    # ``or None`` keeps the historical contract: a falsy result from
    # ``Tika.from_file`` is reported to callers as ``None``.
    return Tika.from_file(self.path) or None