def main():
    """Command-line entry point: run every Tika operation selected by a flag.

    The Tika App jar is taken from ``args.jar`` or, failing that, from the
    ``TIKA_APP_JAR`` environment variable.  The same keyword arguments
    (``path``, ``payload``, ``objectInput``) are passed to every operation;
    ``objectInput`` is ``sys.stdin`` only when ``args.stdin`` is set.

    Each selected operation prints its result.  An ``IOError`` raised by any
    of them is swallowed and the remaining operations are skipped.
    """
    args = get_args()
    jar_location = args.jar or os.environ.get("TIKA_APP_JAR", None)
    tika = TikaApp(jar_location)

    parameters = dict(
        path=args.file,
        payload=args.payload,
        objectInput=sys.stdin if args.stdin else None,
    )

    # (command-line flag, TikaApp method name, request pretty-printing?),
    # listed in the order the operations are executed.
    operations = (
        ("detect", "detect_content_type", False),
        ("text", "extract_only_content", False),
        ("language", "detect_language", False),
        ("all", "extract_all_content", True),
        ("metadata", "extract_only_metadata", True),
    )

    try:
        for flag, method_name, wants_pretty in operations:
            if not getattr(args, flag):
                continue
            if wants_pretty:
                # Once set, the option stays in place for later operations.
                parameters["pretty_print"] = True
            # The method is looked up only when its flag is set.
            print(getattr(tika, method_name)(**parameters))
    except IOError:
        # NOTE(review): presumably tolerates a closed output pipe -- confirm.
        pass
# -*- coding: utf-8 -*-
# @Time : 2020/12/11 10:25
# @Author : Wang Xiya (王西亚)
# @File : c_doc.py

from tikapp import TikaApp

# Ad-hoc check of the metadata Tika App extracts from one PDF.
# Both locations are absolute paths specific to the author's machine.
tika_client = TikaApp(
    file_jar="/usr/local/Cellar/tika/1.24.1_1/libexec/tika-app-1.24.1.jar"
)
metadata = tika_client.extract_only_metadata(
    "/Users/wangxiya/Downloads/000101020062805119-00.pdf"
)

# Show the Python type of the result on one line, then the result itself.
print(type(metadata), metadata, sep="\n")

# Alternative kept for reference: the `tika` package's parser, either with
# its defaults or pointed at a Tika server on localhost.
# from tika import parser
# parsed = parser.from_file('/path/to/file')
# print(parsed["metadata"])
# print(parsed["content"])
# parsed = parser.from_file('/Users/wangxiya/Downloads/000101020062805119-00.pdf', 'http://localhost:9998/tika')
# metadata = parsed["metadata"]
# print(type(metadata))
# print(metadata)
from tikapp import TikaApp

# Machine-specific absolute paths: the Tika App jar and the PDF to analyse.
tika_client = TikaApp(
    file_jar="/Users/yma2/Documents/_garage/python/cxm/tika/tika-app-1.20.jar"
)
analyzeFile = "/Users/yma2/Downloads/Azure_Developer_Guide_eBook_ja-JP.pdf"

# Run the queries one after another, printing each result before the next
# query starts: content type, language, text content, then metadata.
for query_name in (
    "detect_content_type",
    "detect_language",
    "extract_only_content",
    "extract_only_metadata",
):
    print(getattr(tika_client, query_name)(analyzeFile))
def process(self) -> str:
    """
    Extract the document's metadata through Tika and save it as a JSON file.

    The metadata file is stored under ``self.file_content.work_root_dir``
    with the fixed name ``self.FileName_MetaData``.  Depending on the
    configured dependence mode the metadata comes either from a Tika server
    (``TikaServer.from_file``) or from a local Tika application jar
    (``TikaApplication.extract_only_metadata``).

    Note: if memory leaks show up, extract the metadata in a newly created
    process that writes it to the file, and only parse the file in this
    process.

    :return: the result of ``super().process()`` when Tika is disabled in
        the settings, or when the server URL / application path setting
        compares equal to ``''``; a failure result when the configured
        application file does not exist or when extraction or saving raises;
        otherwise a success result that also carries the metadata format
        (``self.MetaDataFormat_Json``).
    """
    default_result = super().process()

    out_metadata_file_fullname = CFile.join_file(
        self.file_content.work_root_dir, self.FileName_MetaData)
    in_file_fullname = self.file_info.file_name_with_full_path

    if not settings.application.xpath_one(
            self.Path_Setting_Dependence_Tika_Enable, True):
        return default_result

    tika_dependence_mode = settings.application.xpath_one(
        self.Path_Setting_Dependence_Tika_Mode, self.Name_Server)
    use_server = CUtils.equal_ignore_case(
        tika_dependence_mode, self.Name_Server)

    # Read and validate the settings of the selected mode up front, so the
    # extraction itself can share a single code path below.
    if use_server:
        tika_server_url = settings.application.xpath_one(
            self.Path_Setting_Dependence_Tika_Server_Url, None)
        tika_server_connect_timeout = settings.application.xpath_one(
            self.Path_Setting_Dependence_Tika_Server_Timeout, 30)
        if CUtils.equal_ignore_case(tika_server_url, ''):
            return default_result
    else:
        tika_application = settings.application.xpath_one(
            self.Path_Setting_Dependence_Tika_Client_App, None)
        if CUtils.equal_ignore_case(tika_application, ''):
            return default_result
        if not CFile.file_or_path_exist(tika_application):
            return CResult.merge_result(
                self.Failure,
                '文档[{0}]的元数据无法提取, 详细原因为: [依赖中间件{1}文件不存在, 请修正后重试!]'.format(
                    in_file_fullname, tika_application))

    try:
        if use_server:
            parsed = TikaServer.from_file(
                in_file_fullname,
                tika_server_url,
                requestOptions={'timeout': tika_server_connect_timeout})
            meta_data_dict = parsed["metadata"]
        else:
            tika_client = TikaApplication(file_jar=tika_application)
            meta_data_dict = tika_client.extract_only_metadata(
                in_file_fullname)

        json_obj = CJson()
        json_obj.load_obj(meta_data_dict)
        json_obj.to_file(out_metadata_file_fullname)
        return CResult.merge_result_info(
            CResult.merge_result(
                self.Success,
                '文档[{0}]的元数据提取成功'.format(in_file_fullname)),
            self.Name_Format, self.MetaDataFormat_Json)
    except Exception as error:
        # Any failure (extraction, JSON conversion, file write) is reported
        # through the result object rather than propagated to the caller.
        return CResult.merge_result(
            self.Failure,
            '文档[{0}]的元数据提取过程出现错误, 详细信息为: [{1}]'.format(
                in_file_fullname, str(error)))