Example #1
0
    def parse_file_local(self,
                         local_path: str,
                         original_file_name: str,
                         timeout: int = 60,
                         encoding_name: str = 'utf-8',
                         logger: ProcessLogger = None,
                         enable_ocr: bool = True) -> Dict:
        """
        Parses a file by running the Tika start command and parsing its output.
        The Tika mode is published through the environment variable named by
        self.TIKA_ENV_VAR_FLAG_MODE before the command is started.
        :param local_path: local path to the file being parsed
        :param original_file_name: file name used in log messages, can differ from local_path
        :param timeout: timeout in seconds, passed to read_output as timeout_sec
        :param encoding_name: encoding passed to Tika (-e option) and used to read its output
        :param logger: optional logger receiving Tika's stderr lines; nothing is logged when None
        :param enable_ocr: True selects TIKA_MODE_OCR, False selects TIKA_MODE_PDF_ONLY
        :return: the value built by _parse from the command output
        :raises Exception: when _parse fails; the message carries a sample of the output
        """
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PDF_ONLY
        # NOTE: this mutates the environment of the current process
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag

        cmd = self.tika_start_command_list + [
            '-J', '-t', f'-e{encoding_name}', local_path
        ]

        def err(line):
            # logger is optional (defaults to None): never fail the parsing
            # just because there is nowhere to report stderr lines to
            if logger is not None:
                logger.info(f'TIKA parsing {original_file_name}:\n{line}')

        # read_output may return an empty / None result: normalize it to ''
        text = read_output(cmd,
                           stderr_callback=err,
                           encoding=encoding_name,
                           timeout_sec=timeout) or ''

        try:
            return _parse((200, text))
        except Exception as ex:
            # include at most 255 characters of the output to keep the message short
            text_sample = text[:255] if text and isinstance(text,
                                                            str) else str(text)
            raise Exception(
                'Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                text_sample) from ex
Example #2
0
    def parse(self,
              option: str,
              url_or_path: str,
              server_endpoint: str = None,
              verbose: int = 0,
              tika_server_jar: str = None,
              response_mime_type: str = 'application/json',
              services: dict = None,
              raw_response: bool = False,
              extra_headers: Dict[str, str] = None) -> Dict:
        """
        The method is called from parse_file_on_server to parse the file
        calling Tika as a server.
        :param option: key used to pick the Tika service ('meta', 'text' or 'all' by default);
                       an unknown key falls back to the 'all' service
        :param url_or_path: local path (or URL) to the file being parsed
        :param server_endpoint: Tika server's URL, self.server_endpoint if not provided
        :param verbose: make Tika produce verbose log
        :param tika_server_jar: path to Tika's JAR file, self.tika_jar_path if not provided
        :param response_mime_type: response format (application/json) for plain text + metadata in JSON format;
                                   forced to text/plain for the '/tika' service
        :param services: mapping of option name to the service path, must contain the 'all' key
        :param raw_response: get raw response from Tika (text + metadata + warnings), False by default
        :param extra_headers: extra request headers, override the default headers on key collision
        :return: dictionary with "content" (text) and "metadata" (another dictionary) keys
        """

        services = services if services else \
            {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}
        tika_server_jar = tika_server_jar if tika_server_jar else self.tika_jar_path
        server_endpoint = server_endpoint if server_endpoint else self.server_endpoint

        path, file_type = getRemoteFile(url_or_path, self.tika_files_path)
        try:
            service = services.get(option, services['all'])
            if service == '/tika':
                response_mime_type = 'text/plain'
            content_path = self.make_content_disposition_header(path)

            headers = {
                'Accept': response_mime_type,
                'Content-Disposition': content_path
            }
            if extra_headers:
                headers = {**headers, **extra_headers}

            # the context manager guarantees the file handle is closed
            # whether the request succeeds or fails
            with open(path, 'rb') as file_obj:
                status, response = callServer('put',
                                              server_endpoint,
                                              service,
                                              file_obj,
                                              headers,
                                              verbose,
                                              tika_server_jar,
                                              rawResponse=raw_response)
        finally:
            # a 'remote' file is a local copy made by getRemoteFile:
            # remove it even if the request has failed
            if file_type == 'remote':
                os.unlink(path)
        return _parse((status, response))
    def parse_file_local_plain_text(self,
                                    local_path: str,
                                    original_file_name: str,
                                    task: Any,
                                    timeout: int = 60,
                                    encoding_name: str = 'utf-8',
                                    logger: ProcessLogger = None,
                                    enable_ocr: bool = True) -> MarkedUpText:
        """
        Parses file (*.pdf, *.doc, *.docx, *.rtf, ...) calling Tika as a Java local process.
        Tika will use plain text "stripper" and transform the source document into plain text
        inside its (Java) process.
        :param local_path: local path to the file being parsed
        :param original_file_name: original file name, can differ from local_path (e.g. temporary file path)
        :param task: passed through to read_output as the "task" argument
        :param timeout: timeout to interrupt Java process in seconds
        :param encoding_name: encoding to use, is passed to Tika
        :param logger: optional logger object to write errors and warnings; nothing is logged when None
        :param enable_ocr: allow (True) converting images to text
        :return: MarkedUpText: text + metadata
        :raises Exception: when the Tika output cannot be parsed; the message carries a sample of the output
        """
        mode_flag = self.TIKA_MODE_OCR if enable_ocr else self.TIKA_MODE_PREFER_TEXT
        # don't use at all TIKA_MODE_PDF_ONLY
        # NOTE: both assignments mutate the environment of the current process
        os.environ[self.TIKA_ENV_VAR_FLAG_MODE] = mode_flag
        os.environ[self.TIKA_PARSER_DETAIL] = ''

        # the dedicated "no OCR" command list is used only when it is configured
        tika_default_command_list = self.tika_lexnlp_default_command_list
        if enable_ocr is False and self.tika_noocr_default_command_list is not None:
            tika_default_command_list = self.tika_noocr_default_command_list
        cmd = tika_default_command_list + [
            '-J', '-t', f'-e{encoding_name}', local_path
        ]

        def err(line):
            # logger is optional (defaults to None): skip reporting instead of failing
            if logger is not None:
                logger.info(f'TIKA parsing {original_file_name}:\n{line}')

        if logger is not None:
            logger.info(f'Tika (plain text) args: {", ".join(cmd)}')

        # read_output may return an empty / None result: normalize it to ''
        text = read_output(cmd,
                           stderr_callback=err,
                           encoding=encoding_name,
                           timeout_sec=timeout,
                           task=task) or ''

        try:
            ptr_val = _parse((200, text))
            return MarkedUpText(text=ptr_val['content'],
                                meta=ptr_val['metadata'])
        except Exception as ex:
            # include at most 255 characters of the output to keep the message short
            text_sample = text[:255] if text and isinstance(text,
                                                            str) else str(text)
            # NOTE(review): the message names parse_default_pdf_ocr, not this method;
            # kept as is because callers or log searches may rely on the text - confirm
            raise Exception(
                'Error in parse_default_pdf_ocr -> _parse(). Text:\n' +
                text_sample) from ex
Example #4
0
    def parse(self,
              option: str,
              url_or_path: str,
              server_endpoint: str = None,
              verbose: int = 0,
              tika_server_jar: str = None,
              response_mime_type: str = 'application/json',
              services: dict = None,
              raw_response: bool = False,
              extra_headers: Dict[str, str] = None) -> Dict:
        """
        Sends the file to the Tika server with a PUT request and parses the response.
        :param option: key used to pick the service path from "services";
                       an unknown key falls back to the 'all' service
        :param url_or_path: local path (or URL) of the file, resolved by getRemoteFile
        :param server_endpoint: Tika server's URL, self.server_endpoint if not provided
        :param verbose: verbosity flag passed to callServer
        :param tika_server_jar: path to Tika's JAR file, self.tika_jar_path if not provided
        :param response_mime_type: value of the Accept header, forced to text/plain
                                   for the '/tika' service
        :param services: mapping of option name to the service path, must contain the 'all' key
        :param raw_response: passed to callServer as rawResponse
        :param extra_headers: extra request headers, override the default headers on key collision
        :return: the value built by _parse from the (status, response) pair
        """

        services = services if services else \
            {'meta': '/meta', 'text': '/tika', 'all': '/rmeta/text'}
        tika_server_jar = tika_server_jar if tika_server_jar else self.tika_jar_path
        server_endpoint = server_endpoint if server_endpoint else self.server_endpoint

        path, file_type = getRemoteFile(url_or_path, self.tika_files_path)
        try:
            service = services.get(option, services['all'])
            if service == '/tika':
                response_mime_type = 'text/plain'
            content_path = self.make_content_disposition_header(path)

            headers = {
                'Accept': response_mime_type,
                'Content-Disposition': content_path
            }
            if extra_headers:
                headers = {**headers, **extra_headers}

            # the context manager guarantees the file handle is closed
            # whether the request succeeds or fails
            with open(path, 'rb') as file_obj:
                status, response = callServer('put',
                                              server_endpoint,
                                              service,
                                              file_obj,
                                              headers,
                                              verbose,
                                              tika_server_jar,
                                              rawResponse=raw_response)
        finally:
            # a 'remote' file is a local copy made by getRemoteFile:
            # remove it even if the request has failed
            if file_type == 'remote':
                os.unlink(path)
        return _parse((status, response))