def main():
    """Run the Tika operations selected on the command line and print each result.

    The jar location comes from ``args.jar`` or, failing that, the
    ``TIKA_APP_JAR`` environment variable.  The same input description
    (``path``, ``payload``, ``objectInput``) is handed to every operation.
    An ``IOError`` raised while running or printing is silently ignored.
    """
    args = get_args()
    jar_location = args.jar or os.environ.get("TIKA_APP_JAR")
    tika = TikaApp(jar_location)

    call_kwargs = {
        "path": args.file,
        "payload": args.payload,
        "objectInput": sys.stdin if args.stdin else None,
    }

    # (attribute on ``args``, TikaApp method name, needs pretty_print)
    # Order matters: it is the order in which results are printed.
    operations = (
        ("detect", "detect_content_type", False),
        ("text", "extract_only_content", False),
        ("language", "detect_language", False),
        ("all", "extract_all_content", True),
        ("metadata", "extract_only_metadata", True),
    )

    try:
        for flag, method_name, wants_pretty in operations:
            if not getattr(args, flag):
                continue
            if wants_pretty:
                # Sticky on purpose: once set it stays in the shared kwargs.
                call_kwargs["pretty_print"] = True
            # Method is looked up only when its flag is set.
            print(getattr(tika, method_name)(**call_kwargs))
    except IOError:
        pass
def main():
    """Run the Tika operations selected on the command line and print each result.

    The jar path is resolved in priority order: ``args.jar`` (when truthy),
    the ``TIKA_APP_JAR`` environment variable, then a built-in default.
    Input is ``args.file`` if given, otherwise ``args.payload``; when neither
    is given nothing is run.  An ``IOError`` raised while running or printing
    is silently ignored.
    """
    args = get_args()

    cli_layer = {"TIKA_APP_JAR": args.jar} if args.jar else {}
    fallback_layer = {"TIKA_APP_JAR": "/opt/tika/tika-app-1.15.jar"}
    settings = ChainMap(cli_layer, os.environ, fallback_layer)
    tika = TikaApp(settings["TIKA_APP_JAR"])

    try:
        # A file takes precedence over a payload when both are supplied.
        if args.file:
            source = {"path": args.file}
        elif args.payload:
            source = {"payload": args.payload}
        else:
            return

        if args.detect:
            print(tika.detect_content_type(**source))
        if args.text:
            print(tika.extract_only_content(**source))
        if args.language:
            print(tika.detect_language(**source))
        if args.all:
            print(tika.extract_all_content(pretty_print=True, **source))
    except IOError:
        pass
class TikaReader(object):
    """Small facade over a ``TikaApp`` client created from a jar path."""

    def __init__(self, path):
        """Build the underlying client.

        Args:
            path: Forwarded to ``TikaApp`` as ``file_jar``.
        """
        client = TikaApp(file_jar=path)
        self.tika_client = client

    def detect_type(self, doc):
        """Return the client's ``detect_content_type`` result for ``doc``."""
        client = self.tika_client
        return client.detect_content_type(doc)

    def detect_language(self, doc):
        """Return the client's ``detect_language`` result for ``doc``."""
        client = self.tika_client
        return client.detect_language(doc)

    def content(self, doc):
        """Return the client's ``extract_all_content`` result for ``doc``."""
        client = self.tika_client
        return client.extract_all_content(doc)
class TikaReader:
    """Bind one document to a ``TikaApp`` client and expose its analyses.

    Every method forwards ``self.file_process`` as the first positional
    argument of the corresponding ``TikaApp`` call.
    """

    # Class initializer.
    def __init__(self, file_process, file_jar="tika-app-1.20.jar"):
        """Create the client and remember the document to process.

        Args:
            file_process: The document handed to every ``TikaApp`` call.
            file_jar: Jar passed to ``TikaApp``.  Defaults to the value that
                was previously hard-coded, so existing callers are unaffected.
        """
        # Tika client, which loads the client jar file.
        self.tika_client = TikaApp(file_jar=file_jar)
        self.file_process = file_process

    # Detector for the MIME content type.
    def detect_document_type(self):
        """Return the client's ``detect_content_type`` result for the document."""
        return self.tika_client.detect_content_type(self.file_process)

    # Detector for the language used in the document.
    def detect_language(self):
        """Return the client's ``detect_language`` result for the document."""
        return self.tika_client.detect_language(self.file_process)

    # Extractor for the complete content of the document.
    def extract_complete_info(self, value=False):
        """Return the client's ``extract_all_content`` result for the document.

        Args:
            value: Forwarded to the client as ``convert_to_obj``.
        """
        return self.tika_client.extract_all_content(
            self.file_process, convert_to_obj=value)

    # Extractor for only the content of the document.
    def extract_content_info(self):
        """Return the client's ``extract_only_content`` result for the document."""
        return self.tika_client.extract_only_content(self.file_process)
from tikapp import TikaApp

# Client backed by a local Tika app jar.
tika_client = TikaApp(
    file_jar="/Users/yma2/Documents/_garage/python/cxm/tika/tika-app-1.20.jar")

# Document to analyse.
analyzeFile = "/Users/yma2/Downloads/Azure_Developer_Guide_eBook_ja-JP.pdf"

# Run each analysis in turn and print its result; the method is looked up
# just before it is called so earlier results are printed first.
for _operation in (
    "detect_content_type",
    "detect_language",
    "extract_only_content",
    "extract_only_metadata",
):
    print(getattr(tika_client, _operation)(analyzeFile))
def tika_detect_language():
    """Detect the language of ``test_zip`` with a freshly created client.

    Returns:
        The result of ``TikaApp.detect_language`` called with
        ``path=test_zip`` on a client built from ``TIKA_APP_JAR``.
    """
    return TikaApp(file_jar=TIKA_APP_JAR).detect_language(path=test_zip)