Esempio n. 1
0
    def execute(self, file_object, page_number):
        """
        Extract the text of one PDF page by running the executable at
        ``self.pdftotext_path`` on a temporary copy of ``file_object``.

        The command is invoked with ``-f``/``-l`` both set to
        ``page_number`` and ``-`` as the output target, so the text of
        that page alone is read back from the process' standard output.

        Returns the raw bytes written by the process. An output consisting
        solely of a form feed (``b'\\x0c'``) is treated as "no text" and
        yields an empty string; a trailing ``b'\\n\\n\\x0c'`` is stripped.

        Raises ``ParserError`` when the process exits with a non-zero
        status.
        """
        # Local import: the rest of the file's imports are not visible here.
        import os

        logger.debug('Parsing PDF page: %d', page_number)

        destination_descriptor, temp_filepath = tempfile.mkstemp(
            dir=setting_temporary_directory.value)
        try:
            copyfile(file_object, temp_filepath)

            page = str(page_number)
            command = [
                self.pdftotext_path, '-f', page, '-l', page,
                temp_filepath, '-'
            ]

            proc = subprocess.Popen(command,
                                    close_fds=True,
                                    stderr=subprocess.PIPE,
                                    stdout=subprocess.PIPE)
            # communicate() drains both pipes while waiting. Calling wait()
            # first can deadlock once the child fills a pipe buffer.
            output, error_output = proc.communicate()
        finally:
            # mkstemp() leaves closing the descriptor and deleting the file
            # to the caller; do both regardless of how extraction went.
            os.close(destination_descriptor)
            try:
                os.remove(temp_filepath)
            except OSError:
                logger.debug(
                    'Unable to remove temporary file: %s', temp_filepath
                )

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        if output[-3:] == b'\x0a\x0a\x0c':
            return output[:-3]

        return output
Esempio n. 2
0
    def parse(self, document_page, descriptor=None):
        """
        Extract the text of ``document_page`` by running the executable at
        ``self.pdftotext_path`` and store the result on the page.

        When ``descriptor`` is given, its content is copied to a temporary
        file which is used as the input and removed afterwards. Otherwise
        the input path is obtained from
        ``document_page.document.document_save_to_temp_dir()``; that file
        is not removed here.

        On success ``document_page.content`` and
        ``document_page.page_label`` are set and the page is saved.

        Raises ``ParserError`` when the process exits with a non-zero
        status or when its output is only a form feed (``b'\\x0c'``).
        """
        # Local import: the rest of the file's imports are not visible here.
        import os

        logger.debug('parsing PDF with PopplerParser')
        pagenum = str(document_page.page_number)

        # Only set when this method created the temporary file itself.
        temp_filepath = None
        try:
            if descriptor:
                destination_descriptor, temp_filepath = tempfile.mkstemp(
                    dir=TEMPORARY_DIRECTORY)
                # copyfile is handed the path, so the open descriptor
                # returned by mkstemp() is not needed; close it to avoid
                # leaking it.
                os.close(destination_descriptor)
                copyfile(descriptor, temp_filepath)
                document_file = temp_filepath
            else:
                document_file = (
                    document_page.document.document_save_to_temp_dir(
                        document_page.document.checksum)
                )

            logger.debug('document_file: %s', document_file)

            logger.debug('parsing PDF page %s', pagenum)

            command = [
                self.pdftotext_path, '-f', pagenum, '-l', pagenum,
                document_file, '-'
            ]

            proc = subprocess.Popen(command,
                                    close_fds=True,
                                    stderr=subprocess.PIPE,
                                    stdout=subprocess.PIPE)
            # communicate() drains both pipes while waiting. Calling wait()
            # first can deadlock once the child fills a pipe buffer.
            output, error_output = proc.communicate()
        finally:
            if temp_filepath is not None:
                try:
                    os.remove(temp_filepath)
                except OSError:
                    logger.debug(
                        'Unable to remove temporary file: %s', temp_filepath
                    )

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            raise ParserError('No output')

        document_page.content = output
        document_page.page_label = _('Text extracted from PDF')
        document_page.save()
Esempio n. 3
0
    def parse(self, document_page, descriptor=None):
        """
        Run the executable at ``self.pdftotext_path`` for the page number
        of ``document_page`` and save the extracted text on the page.

        If ``descriptor`` is truthy it is copied into a temporary file that
        serves as the input and is deleted once the process has finished.
        If not, the input path comes from
        ``document_page.document.document_save_to_temp_dir()`` and is left
        in place.

        Side effects on success: ``document_page.content`` and
        ``document_page.page_label`` are assigned, then
        ``document_page.save()`` is called.

        Raises ``ParserError`` on a non-zero exit status, or when the
        output is just a form feed (``b'\\x0c'``).
        """
        # Local import: the rest of the file's imports are not visible here.
        import os

        logger.debug('parsing PDF with PopplerParser')
        pagenum = str(document_page.page_number)

        # Stays None unless this call creates its own temporary copy.
        temp_filepath = None
        try:
            if descriptor:
                destination_descriptor, temp_filepath = tempfile.mkstemp(dir=TEMPORARY_DIRECTORY)
                # Only the path is used from here on; release the OS-level
                # handle that mkstemp() opened so it is not leaked.
                os.close(destination_descriptor)
                copyfile(descriptor, temp_filepath)
                document_file = temp_filepath
            else:
                document_file = document_page.document.document_save_to_temp_dir(document_page.document.checksum)

            logger.debug('document_file: %s', document_file)

            logger.debug('parsing PDF page %s', pagenum)

            command = [self.pdftotext_path, '-f', pagenum, '-l', pagenum, document_file, '-']

            proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            # Read both pipes to completion while waiting. wait() followed
            # by read() can block forever if the child fills a pipe buffer.
            output, error_output = proc.communicate()
        finally:
            if temp_filepath is not None:
                try:
                    os.remove(temp_filepath)
                except OSError:
                    logger.debug('Unable to remove temporary file: %s', temp_filepath)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            raise ParserError('No output')

        document_page.content = output
        document_page.page_label = _('Text extracted from PDF')
        document_page.save()
Esempio n. 4
0
    def execute(self, file_object, page_number):
        """
        Extract the text of one PDF page by running the executable at
        ``self.pdftotext_path`` on a temporary copy of ``file_object``.

        ``page_number`` is passed as both the ``-f`` and ``-l`` argument
        and ``-`` as the output target, so the page text is read from the
        process' standard output.

        Returns the raw output bytes. An output that is only a form feed
        (``b'\\x0c'``) yields an empty string; a trailing
        ``b'\\n\\n\\x0c'`` is stripped.

        Raises ``ParserError`` when the process exits with a non-zero
        status.
        """
        logger.debug('Parsing PDF page: %d', page_number)

        destination_descriptor, temp_filepath = mkstemp()
        try:
            copyfile(file_object, temp_filepath)

            page = str(page_number)
            command = [
                self.pdftotext_path, '-f', page, '-l', page,
                temp_filepath, '-'
            ]

            proc = subprocess.Popen(
                command, close_fds=True, stderr=subprocess.PIPE,
                stdout=subprocess.PIPE
            )
            # communicate() drains both pipes while waiting. Calling wait()
            # first can deadlock once the child fills a pipe buffer.
            output, error_output = proc.communicate()
        finally:
            # Single cleanup point: also runs when copyfile() or Popen()
            # raises, which previously left the temporary file behind.
            fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        if output == b'\x0c':
            logger.debug('Parser didn\'t return any output')
            return ''

        if output[-3:] == b'\x0a\x0a\x0c':
            return output[:-3]

        return output