def execute(self, file_object, page_number):
    """Extract the text of one PDF page by running ``pdftotext``.

    The content of ``file_object`` is copied to a temporary file, which is
    handed to the executable at ``self.pdftotext_path`` with the first and
    last page options both set to ``page_number``. The temporary file is
    always closed and removed before returning.

    Args:
        file_object: source handed to ``copyfile`` to fill the temporary
            file.
        page_number: page to extract; must be an integer (it is logged
            with ``%d``).

    Returns:
        ``''`` when the program emitted only a form feed (no text),
        otherwise the captured bytes with a trailing
        newline-newline-form-feed sequence stripped if present.

    Raises:
        ParserError: if the program exits with a non-zero return code.
    """
    import os

    logger.debug('Parsing PDF page: %d', page_number)

    destination_descriptor, temp_filepath = tempfile.mkstemp(
        dir=setting_temporary_directory.value
    )
    try:
        copyfile(file_object, temp_filepath)

        page = str(page_number)
        command = [
            self.pdftotext_path, '-f', page, '-l', page, temp_filepath, '-'
        ]

        proc = subprocess.Popen(
            command, close_fds=True, stderr=subprocess.PIPE,
            stdout=subprocess.PIPE
        )
        # communicate() drains both pipes while waiting. Calling wait()
        # first can deadlock once the child fills a pipe buffer.
        output, error_output = proc.communicate()
    finally:
        # mkstemp() leaves closing and deleting the file to the caller.
        os.close(destination_descriptor)
        try:
            os.remove(temp_filepath)
        except OSError:
            logger.debug(
                'Unable to remove temporary file: %s', temp_filepath
            )

    if proc.returncode != 0:
        # Log only the first line of the error stream.
        logger.error(error_output.partition(b'\n')[0])
        raise ParserError

    if output == b'\x0c':
        logger.debug('Parser didn\'t return any output')
        return ''

    if output[-3:] == b'\x0a\x0a\x0c':
        return output[:-3]

    return output
def parse(self, document_page, descriptor=None):
    """Extract a page's text with ``pdftotext`` and save it on the page.

    When ``descriptor`` is given its content is copied to a temporary file
    that is used as the program's input and removed afterwards. Otherwise
    the input path comes from
    ``document_page.document.document_save_to_temp_dir`` and is left
    untouched.

    Args:
        document_page: page object; its ``page_number`` selects the page
            and its ``content`` and ``page_label`` are set and saved on
            success.
        descriptor: optional source handed to ``copyfile``.

    Raises:
        ParserError: if the program exits with a non-zero return code, or
            (with the message ``'No output'``) if it emitted only a form
            feed.
    """
    import os

    logger.debug('parsing PDF with PopplerParser')
    pagenum = str(document_page.page_number)

    destination_descriptor = None
    temp_filepath = None
    try:
        if descriptor:
            destination_descriptor, temp_filepath = tempfile.mkstemp(
                dir=TEMPORARY_DIRECTORY
            )
            copyfile(descriptor, temp_filepath)
            document_file = temp_filepath
        else:
            document_file = document_page.document.document_save_to_temp_dir(
                document_page.document.checksum
            )

        logger.debug('document_file: %s', document_file)
        logger.debug('parsing PDF page %s', pagenum)

        command = [
            self.pdftotext_path, '-f', pagenum, '-l', pagenum,
            document_file, '-'
        ]

        proc = subprocess.Popen(
            command, close_fds=True, stderr=subprocess.PIPE,
            stdout=subprocess.PIPE
        )
        # communicate() drains both pipes while waiting. Calling wait()
        # first can deadlock once the child fills a pipe buffer.
        output, error_output = proc.communicate()
    finally:
        # Only clean up the file created here; the path returned by
        # document_save_to_temp_dir() is not owned by this method.
        if temp_filepath is not None:
            os.close(destination_descriptor)
            try:
                os.remove(temp_filepath)
            except OSError:
                logger.debug(
                    'Unable to remove temporary file: %s', temp_filepath
                )

    if proc.returncode != 0:
        # Log only the first line of the error stream.
        logger.error(error_output.partition(b'\n')[0])
        raise ParserError

    if output == b'\x0c':
        logger.debug('Parser didn\'t return any output')
        raise ParserError('No output')

    document_page.content = output
    document_page.page_label = _('Text extracted from PDF')
    document_page.save()
def parse(self, document_page, descriptor=None):
    """Run ``pdftotext`` on one page and store the result on the page.

    If ``descriptor`` is truthy, its content is copied into a temporary
    file which becomes the program's input; that file is closed and
    deleted before this method returns. Without a descriptor the input
    path is obtained from
    ``document_page.document.document_save_to_temp_dir`` and is not
    deleted here.

    Args:
        document_page: page object providing ``page_number`` and
            ``document``; on success its ``content`` and ``page_label``
            are assigned and ``save()`` is called.
        descriptor: optional source passed to ``copyfile``.

    Raises:
        ParserError: on a non-zero exit code, or with ``'No output'`` when
            the only output is a form feed.
    """
    import os

    logger.debug('parsing PDF with PopplerParser')
    pagenum = str(document_page.page_number)

    owned_descriptor = None
    owned_filepath = None
    try:
        if descriptor:
            owned_descriptor, owned_filepath = tempfile.mkstemp(
                dir=TEMPORARY_DIRECTORY
            )
            copyfile(descriptor, owned_filepath)
            document_file = owned_filepath
        else:
            document = document_page.document
            document_file = document.document_save_to_temp_dir(
                document.checksum
            )

        logger.debug('document_file: %s', document_file)
        logger.debug('parsing PDF page %s', pagenum)

        command = [
            self.pdftotext_path,
            '-f', pagenum,
            '-l', pagenum,
            document_file,
            '-',
        ]

        proc = subprocess.Popen(
            command, close_fds=True, stderr=subprocess.PIPE,
            stdout=subprocess.PIPE
        )
        # wait() with both streams piped can block forever when the child
        # fills a pipe buffer, so read both streams with communicate().
        output, error_output = proc.communicate()
    finally:
        # Release only the temporary copy made by this call.
        if owned_filepath is not None:
            os.close(owned_descriptor)
            try:
                os.remove(owned_filepath)
            except OSError:
                logger.debug(
                    'Unable to remove temporary file: %s', owned_filepath
                )

    if proc.returncode != 0:
        # Log only the first line of the error stream.
        logger.error(error_output.partition(b'\n')[0])
        raise ParserError

    if output == b'\x0c':
        logger.debug('Parser didn\'t return any output')
        raise ParserError('No output')

    document_page.content = output
    document_page.page_label = _('Text extracted from PDF')
    document_page.save()
def execute(self, file_object, page_number):
    """Extract the text of one PDF page by running ``pdftotext``.

    The content of ``file_object`` is copied to a temporary file, which is
    handed to the executable at ``self.pdftotext_path`` with the first and
    last page options both set to ``page_number``. The temporary file is
    released through ``fs_cleanup`` on every exit path.

    Args:
        file_object: source handed to ``copyfile`` to fill the temporary
            file.
        page_number: page to extract; must be an integer (it is logged
            with ``%d``).

    Returns:
        ``''`` when the program emitted only a form feed (no text),
        otherwise the captured bytes with a trailing
        newline-newline-form-feed sequence stripped if present.

    Raises:
        ParserError: if the program exits with a non-zero return code.
    """
    logger.debug('Parsing PDF page: %d', page_number)

    destination_descriptor, temp_filepath = mkstemp()
    try:
        copyfile(file_object, temp_filepath)

        page = str(page_number)
        command = [
            self.pdftotext_path, '-f', page, '-l', page, temp_filepath, '-'
        ]

        proc = subprocess.Popen(
            command, close_fds=True, stderr=subprocess.PIPE,
            stdout=subprocess.PIPE
        )
        # communicate() drains both pipes while waiting. Calling wait()
        # first can deadlock once the child fills a pipe buffer.
        output, error_output = proc.communicate()
    finally:
        # A single cleanup point also covers failures in copyfile() or
        # Popen(), which previously left the temporary file behind.
        fs_cleanup(temp_filepath, file_descriptor=destination_descriptor)

    if proc.returncode != 0:
        # Log only the first line of the error stream.
        logger.error(error_output.partition(b'\n')[0])
        raise ParserError

    if output == b'\x0c':
        logger.debug('Parser didn\'t return any output')
        return ''

    if output[-3:] == b'\x0a\x0a\x0c':
        return output[:-3]

    return output