def parse(self,
          option: str,
          url_or_path: str,
          server_endpoint: str = None,
          verbose: int = 0,
          tika_server_jar: str = None,
          response_mime_type: str = 'application/json',
          services: dict = None,
          raw_response: bool = False,
          extra_headers: Dict[str, str] = None) -> Dict:
    """
    Parse a file by sending it to a Tika server with a PUT request.

    The method is called from parse_file_on_server to parse the file
    calling Tika as a server.

    :param option: key used to pick the service path from ``services``
        ('meta', 'text' or 'all' with the default mapping); an unknown key
        falls back to the 'all' service
    :param url_or_path: local path (or URL) to the file being parsed
    :param server_endpoint: Tika server's URL, ``self.server_endpoint`` is used
        when omitted
    :param verbose: make Tika produce verbose log
    :param tika_server_jar: path to Tika's JAR file, ``self.tika_jar_path`` is
        used when omitted
    :param response_mime_type: value of the ``Accept`` request header
        (application/json for plain text + metadata in JSON format); replaced
        by 'text/plain' when the selected service is '/tika'
    :param services: mapping of option name to service path on the server;
        defaults to {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}
    :param raw_response: get raw response from Tika (text + metadata + warnings),
        False by default
    :param extra_headers: extra request headers; they override the default
        'Accept' and 'Content-Disposition' headers on key collision
    :return: dictionary with "content" (text) and "metadata" (another dictionary)
        keys, as built by ``_parse`` from the (status, response) pair
    """
    services = services if services else \
        {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}
    tika_server_jar = tika_server_jar if tika_server_jar else self.tika_jar_path
    server_endpoint = server_endpoint if server_endpoint else self.server_endpoint

    path, file_type = getRemoteFile(url_or_path, self.tika_files_path)
    try:
        service = services.get(option, services['all'])
        if service == '/tika':
            # The '/tika' service is always requested as plain text.
            response_mime_type = 'text/plain'

        content_path = self.make_content_disposition_header(path)
        headers = {
            'Accept': response_mime_type,
            'Content-Disposition': content_path
        }
        if extra_headers:
            headers = {**headers, **extra_headers}

        # Close the handle as soon as the request is done: an open handle
        # leaks a descriptor and blocks the unlink below on Windows.
        with open(path, 'rb') as file_obj:
            status, response = callServer('put', server_endpoint, service, file_obj,
                                          headers, verbose, tika_server_jar,
                                          rawResponse=raw_response)
    finally:
        # A file fetched from a URL is a temporary copy; remove it even when
        # the request fails so failed calls do not accumulate files.
        if file_type == 'remote':
            os.unlink(path)

    return _parse((status, response))
def parse(self,
          option: str,
          url_or_path: str,
          server_endpoint: str = None,
          verbose: int = 0,
          tika_server_jar: str = None,
          response_mime_type: str = 'application/json',
          services: dict = None,
          raw_response: bool = False,
          extra_headers: Dict[str, str] = None) -> Dict:
    """
    Send a file to a Tika server (PUT) and return the parsed result.

    :param option: key looked up in ``services`` to choose the service path;
        a key that is not present selects the 'all' service
    :param url_or_path: local path or URL of the file; it is resolved to a
        local path through ``getRemoteFile``
    :param server_endpoint: server URL, defaults to ``self.server_endpoint``
    :param verbose: verbosity value forwarded to ``callServer``
    :param tika_server_jar: JAR path forwarded to ``callServer``, defaults to
        ``self.tika_jar_path``
    :param response_mime_type: value of the ``Accept`` header; forced to
        'text/plain' when the chosen service is '/tika'
    :param services: mapping of option name to service path, defaults to
        {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}
    :param raw_response: forwarded to ``callServer`` as ``rawResponse``
    :param extra_headers: additional request headers, merged over the
        default 'Accept' and 'Content-Disposition' headers
    :return: the value produced by ``_parse`` for the (status, response) pair
    """
    if not services:
        services = {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}
    if not tika_server_jar:
        tika_server_jar = self.tika_jar_path
    if not server_endpoint:
        server_endpoint = self.server_endpoint

    path, file_type = getRemoteFile(url_or_path, self.tika_files_path)
    try:
        service = services.get(option, services['all'])
        if service == '/tika':
            # Plain-text service: override whatever Accept type was requested.
            response_mime_type = 'text/plain'

        headers = {
            'Accept': response_mime_type,
            'Content-Disposition': self.make_content_disposition_header(path)
        }
        if extra_headers:
            headers = {**headers, **extra_headers}

        # The context manager guarantees the upload handle is closed before
        # the temporary file is deleted (deleting an open file fails on
        # Windows) and prevents a descriptor leak on every call.
        with open(path, 'rb') as payload:
            status, response = callServer('put', server_endpoint, service, payload,
                                          headers, verbose, tika_server_jar,
                                          rawResponse=raw_response)
    finally:
        # Files reported as 'remote' are local temporary copies; clean them up
        # regardless of whether the server call succeeded.
        if file_type == 'remote':
            os.unlink(path)

    return _parse((status, response))