Beispiel #1
0
 def get_image_cache_name(self, page):
     """Return the path of the cached image for ``page``, building it if absent.

     The cache path and the transformations to apply are obtained from
     ``self.get_cached_image_name(page)``.  When that path already exists on
     disk it is returned untouched; otherwise the document is exported with
     ``document_save_to_temp_dir`` and the result of ``convert`` is returned.
     """
     target_path, page_transformations = self.get_cached_image_name(page)

     # Cache miss: export the document and render the requested page.
     if not os.path.exists(target_path):
         source_file = document_save_to_temp_dir(self, self.checksum)
         return convert(
             source_file,
             output_filepath=target_path,
             page=page,
             transformations=page_transformations
         )

     return target_path
Beispiel #2
0
    def parse(self, document_page, descriptor=None):
        """Parse an office document page by converting the document to PDF.

        The document is exported with ``document_save_to_temp_dir``, run
        through ``OfficeConverter`` and, if the converter reports an output
        file, that file is handed to ``parse_document_page`` with the
        ``application/pdf`` mimetype.

        Args:
            document_page: page whose ``document`` is converted and parsed.
            descriptor: accepted for interface compatibility; not used here.

        Raises:
            ParserError: if the converter raises ``OfficeConversionError`` or
                reports that no output file exists.
        """
        logger.debug('executing')
        try:
            office_converter = OfficeConverter()
            document_file = document_save_to_temp_dir(
                document_page.document, document_page.document.checksum)
            logger.debug('document_file: %s', document_file)

            office_converter.convert(
                document_file, mimetype=document_page.document.file_mimetype)
            if office_converter.exists:
                input_filepath = office_converter.output_filepath
                logger.debug('office_converter.output_filepath: %s',
                             input_filepath)

                # Now that the office document has been converted to PDF
                # call the corresponding PDF parser on this new file.
                # The file is opened in binary mode (PDF is not text) and
                # closed as soon as the parser returns, so the handle is
                # no longer leaked.
                with open(input_filepath, 'rb') as pdf_descriptor:
                    parse_document_page(document_page,
                                        descriptor=pdf_descriptor,
                                        mimetype=u'application/pdf')
            else:
                raise ParserError

        except OfficeConversionError as msg:
            logger.error(msg)
            raise ParserError
Beispiel #3
0
def convert_document_for_ocr(document, page=DEFAULT_PAGE_INDEX_NUMBER, file_format=DEFAULT_OCR_FILE_FORMAT):
    """Run the OCR pre-processing pipeline for one page; return the output path.

    The document is exported with ``document_save_to_temp_dir`` and the page
    goes through ``backend.execute_convert`` (page transformations, then
    ``OCR_OPTIONS``), ``execute_unpaper`` and a last ``backend.execute_convert``
    into ``file_format``.  The three intermediate files are passed to
    ``cleanup`` whether or not the pipeline succeeds.

    ``page`` is used unchanged in the file names and in the ``[page]``
    selector, while the page record is looked up with ``page_number=page + 1``.
    """
    # Export the document file
    source_path = document_save_to_temp_dir(document, document.uuid)

    # All working files share the exported file's base name (extension dropped)
    source_stem, _source_extension = os.path.splitext(os.path.basename(source_path))
    working_prefix = os.path.join(TEMPORARY_DIRECTORY, source_stem)
    transformed_path = u'%s_trans%s%s%s' % (working_prefix, page, os.extsep, file_format)
    unpaper_source_path = u'%s_unpaper_in%s%spnm' % (working_prefix, page, os.extsep)
    unpaper_result_path = u'%s_unpaper_out%s%spnm' % (working_prefix, page, os.extsep)
    ocr_ready_path = u'%s_ocr%s%s%s' % (working_prefix, page, os.extsep, file_format)

    page_selector = u'%s[%s]' % (source_path, page)

    try:
        page_record = document.documentpage_set.get(page_number=page + 1)
        page_arguments, _unused_warnings = page_record.get_transformation_string()

        # Step 1: apply the page's own transformations
        backend.execute_convert(
            input_filepath=page_selector,
            quality=QUALITY_HIGH,
            arguments=page_arguments,
            output_filepath=transformed_path
        )
        # Step 2: apply the OCR options
        backend.execute_convert(
            input_filepath=transformed_path,
            arguments=OCR_OPTIONS,
            output_filepath=unpaper_source_path
        )
        # Step 3: run unpaper
        execute_unpaper(
            input_filepath=unpaper_source_path,
            output_filepath=unpaper_result_path
        )
        # Step 4: convert to the requested output format
        backend.execute_convert(
            input_filepath=unpaper_result_path,
            output_filepath=ocr_ready_path
        )
    finally:
        # Intermediate files are discarded on success and on failure alike
        for leftover in (transformed_path, unpaper_source_path, unpaper_result_path):
            cleanup(leftover)

    return ocr_ready_path
Beispiel #4
0
 def get_image_cache_name(self, page, version):
     """Return the cached image path for ``page`` of ``version``, building it if absent.

     The cache path and transformations are obtained from
     ``self.get_cached_image_name(page, version)``.  An existing cache file
     is returned as is.  Otherwise the ``DocumentVersion`` whose primary key
     is ``version`` is exported with ``document_save_to_temp_dir`` and the
     result of ``convert`` (called with ``self.file_mimetype``) is returned.
     """
     target_path, page_transformations = self.get_cached_image_name(page, version)

     # Cache miss: export the requested version and render the page.
     if not os.path.exists(target_path):
         version_record = DocumentVersion.objects.get(pk=version)
         source_file = document_save_to_temp_dir(version_record, version_record.checksum)
         return convert(
             source_file,
             output_filepath=target_path,
             page=page,
             transformations=page_transformations,
             mimetype=self.file_mimetype
         )

     return target_path
Beispiel #5
0
def convert_document_for_ocr(document, page=0, format='tif'):
    """Prepare one page of ``document`` for OCR and return the output path.

    The document is exported with ``document_save_to_temp_dir``; the page's
    stored transformations are turned into a convert argument string and the
    page then goes through ``execute_convert`` (transformations, then
    ``OCR_OPTIONS``), ``execute_unpaper`` and a final ``execute_convert``.

    Args:
        document: object exposing ``uuid`` and ``documentpage_set``.
        page: page index used in file names and in the ``[page]`` selector;
            the page record is looked up with ``page_number=page + 1``.
        format: extension used for the transformed and final files.

    Returns:
        Path of the final converted file.

    Raises:
        Any exception raised by ``execute_convert`` or ``execute_unpaper``.
        These used to be silently discarded by a ``return`` inside the
        ``finally`` clause, handing callers a path to a file that may not
        exist.
    """
    import logging

    #Extract document file
    input_filepath = document_save_to_temp_dir(document, document.uuid)

    #Convert for OCR
    temp_filename, separator = os.path.splitext(os.path.basename(input_filepath))
    temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
    transformation_output_file = '%s_trans%s%s%s' % (temp_path, page, os.extsep, format)
    unpaper_input_file = '%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
    unpaper_output_file = '%s_unpaper_out%s%spnm' % (temp_path, page, os.extsep)
    convert_output_file = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)

    input_arg = '%s[%s]' % (input_filepath, page)

    transformation_list = []
    try:
        #Catch invalid or non existing pages
        document_page = document.documentpage_set.get(document=document, page_number=page + 1)
        for page_transformation in document_page.documentpagetransformation_set.all():
            try:
                if page_transformation.transformation in TRANFORMATION_CHOICES:
                    # NOTE(review): eval() executes whatever is stored in
                    # ``arguments``; if that field can hold untrusted input
                    # this is a code-execution risk and should be replaced
                    # by a safe parser.
                    output = TRANFORMATION_CHOICES[page_transformation.transformation] % eval(page_transformation.arguments)
                    transformation_list.append(output)
            except Exception as e:
                # There is no ``request`` in this function, so the old
                # per-user message raised NameError here.  A broken
                # transformation is logged and skipped instead.
                logging.getLogger(__name__).warning(
                    'Error for transformation %s: %s',
                    page_transformation.get_transformation_display(), e)
    except ObjectDoesNotExist:
        # No such page: continue without page specific transformations
        pass

    transformation_string = ' '.join(transformation_list)
    try:
        #Apply default transformations
        execute_convert(input_filepath=input_arg, quality=QUALITY_HIGH, arguments=transformation_string, output_filepath=transformation_output_file)
        #Do OCR operations
        execute_convert(input_filepath=transformation_output_file, arguments=OCR_OPTIONS, output_filepath=unpaper_input_file)
        # Process by unpaper
        execute_unpaper(input_filepath=unpaper_input_file, output_filepath=unpaper_output_file)
        # Convert to tif
        execute_convert(input_filepath=unpaper_output_file, output_filepath=convert_output_file)
    finally:
        # Intermediate files are removed on success and on failure alike
        cleanup(transformation_output_file)
        cleanup(unpaper_input_file)
        cleanup(unpaper_output_file)

    # Returned outside ``finally`` so conversion errors reach the caller
    return convert_output_file
Beispiel #6
0
def office_parser(document_page):
    """Parse an office document page by converting the document to PDF.

    The document is exported with ``document_save_to_temp_dir``, converted
    with ``OfficeConverter`` and, when the converter reports an output file,
    that file is handed to ``pdf_parser``.

    Args:
        document_page: page whose ``document`` is converted and parsed.

    Raises:
        ParserError: if the converter raises ``OfficeConversionError`` or
            reports that no output file exists.
    """
    logger.debug('executing')
    try:
        office_converter = OfficeConverter()
        document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)
        logger.debug('document_file: %s', document_file)

        office_converter.convert(document_file, mimetype=document_page.document.file_mimetype)
        if office_converter.exists:
            input_filepath = office_converter.output_filepath
            logger.debug('office_converter.output_filepath: %s', input_filepath)

            # Binary mode because the converted file is a PDF; the context
            # manager closes the handle once parsing ends (it used to leak).
            with open(input_filepath, 'rb') as pdf_descriptor:
                pdf_parser(document_page, descriptor=pdf_descriptor)
        else:
            raise ParserError

    except OfficeConversionError as msg:
        # Report through the module logger instead of printing to stdout
        logger.error(msg)
        raise ParserError
Beispiel #7
0
    def parse(self, document_page, descriptor=None):
        """Extract the text of one PDF page with ``pdftotext`` and save it.

        Args:
            document_page: page record; ``page_number`` selects the page and
                ``content`` / ``page_label`` are set before ``save()``.
            descriptor: optional source for the PDF.  When given it is copied
                to a temporary file with ``copyfile`` and that copy is parsed
                instead of the stored document.

        Raises:
            ParserError: if ``pdftotext`` exits with a non-zero status or its
                output is only a form feed character.
        """
        import os

        logger.debug('parsing PDF with PopplerParser')
        pagenum = str(document_page.page_number)

        temp_filepath = None
        try:
            if descriptor:
                destination_descriptor, temp_filepath = tempfile.mkstemp(
                    dir=TEMPORARY_DIRECTORY)
                # mkstemp() returns an already open OS-level handle; close it
                # so it is not leaked (only the path is needed).
                os.close(destination_descriptor)
                copyfile(descriptor, temp_filepath)
                document_file = temp_filepath
            else:
                document_file = document_save_to_temp_dir(
                    document_page.document, document_page.document.checksum)

            logger.debug('document_file: %s', document_file)

            logger.debug('parsing PDF page %s', pagenum)

            command = [
                self.pdftotext_path,
                '-f', pagenum,
                '-l', pagenum,
                document_file,
                '-',
            ]

            proc = subprocess.Popen(command,
                                    close_fds=True,
                                    stderr=subprocess.PIPE,
                                    stdout=subprocess.PIPE)
            # communicate() drains both pipes while waiting; wait() alone can
            # deadlock once the child fills a pipe buffer.
            output, error_output = proc.communicate()
        finally:
            # The temporary copy is private to this call; remove it.
            if temp_filepath is not None:
                try:
                    os.remove(temp_filepath)
                except OSError as exception:
                    logger.debug('unable to remove temporary file %s; %s',
                                 temp_filepath, exception)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        if output == '\x0c':
            logger.debug('Parser didn\'t any output')
            raise ParserError('No output')

        document_page.content = output
        document_page.page_label = _(u'Text extracted from PDF')
        document_page.save()
Beispiel #8
0
    def parse(self, document_page, descriptor=None):
        """Parse an office document page by converting the document to PDF.

        The document is exported with ``document_save_to_temp_dir``, run
        through ``OfficeConverter`` and, if the converter reports an output
        file, that file is handed to ``parse_document_page`` with the
        ``application/pdf`` mimetype.

        Args:
            document_page: page whose ``document`` is converted and parsed.
            descriptor: accepted for interface compatibility; not used here.

        Raises:
            ParserError: if the converter raises ``OfficeConversionError`` or
                reports that no output file exists.
        """
        logger.debug('executing')
        try:
            office_converter = OfficeConverter()
            document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)
            logger.debug('document_file: %s', document_file)

            office_converter.convert(document_file, mimetype=document_page.document.file_mimetype)
            if office_converter.exists:
                input_filepath = office_converter.output_filepath
                logger.debug('office_converter.output_filepath: %s', input_filepath)

                # Now that the office document has been converted to PDF
                # call the corresponding PDF parser on this new file.
                # Binary mode because PDF is not text; the context manager
                # closes the handle once parsing ends (it used to leak).
                with open(input_filepath, 'rb') as pdf_descriptor:
                    parse_document_page(document_page, descriptor=pdf_descriptor, mimetype=u'application/pdf')
            else:
                raise ParserError

        except OfficeConversionError as msg:
            logger.error(msg)
            raise ParserError
Beispiel #9
0
def office_parser(document_page):
    """Parse an office document page by converting the document to PDF.

    The document is exported with ``document_save_to_temp_dir``, converted
    with ``OfficeConverter`` and, when the converter reports an output file,
    that file is handed to ``pdf_parser``.

    Args:
        document_page: page whose ``document`` is converted and parsed.

    Raises:
        ParserError: if the converter raises ``OfficeConversionError`` or
            reports that no output file exists.
    """
    logger.debug('executing')
    try:
        office_converter = OfficeConverter()
        document_file = document_save_to_temp_dir(
            document_page.document, document_page.document.checksum)
        logger.debug('document_file: %s', document_file)

        office_converter.convert(document_file,
                                 mimetype=document_page.document.file_mimetype)
        if office_converter.exists:
            input_filepath = office_converter.output_filepath
            logger.debug('office_converter.output_filepath: %s',
                         input_filepath)

            # Binary mode because the converted file is a PDF; the context
            # manager closes the handle once parsing ends (it used to leak).
            with open(input_filepath, 'rb') as pdf_descriptor:
                pdf_parser(document_page, descriptor=pdf_descriptor)
        else:
            raise ParserError

    except OfficeConversionError as msg:
        # Report through the module logger instead of printing to stdout
        logger.error(msg)
        raise ParserError
Beispiel #10
0
    def parse(self, document_page, descriptor=None):
        """Extract the text of one PDF page with ``pdftotext`` and save it.

        Args:
            document_page: page record; ``page_number`` selects the page and
                ``content`` / ``page_label`` are set before ``save()``.
            descriptor: optional source for the PDF.  When given it is copied
                to a temporary file with ``copyfile`` and that copy is parsed
                instead of the stored document.

        Raises:
            ParserError: if ``pdftotext`` exits with a non-zero status or its
                output is only a form feed character.
        """
        import os

        logger.debug('parsing PDF with PopplerParser')
        pagenum = str(document_page.page_number)

        temp_filepath = None
        try:
            if descriptor:
                destination_descriptor, temp_filepath = tempfile.mkstemp(dir=TEMPORARY_DIRECTORY)
                # mkstemp() returns an already open OS-level handle; close it
                # so it is not leaked (only the path is needed).
                os.close(destination_descriptor)
                copyfile(descriptor, temp_filepath)
                document_file = temp_filepath
            else:
                document_file = document_save_to_temp_dir(document_page.document, document_page.document.checksum)

            logger.debug('document_file: %s', document_file)

            logger.debug('parsing PDF page %s', pagenum)

            command = [
                self.pdftotext_path,
                '-f', pagenum,
                '-l', pagenum,
                document_file,
                '-',
            ]

            proc = subprocess.Popen(command, close_fds=True, stderr=subprocess.PIPE, stdout=subprocess.PIPE)
            # communicate() drains both pipes while waiting; wait() alone can
            # deadlock once the child fills a pipe buffer.
            output, error_output = proc.communicate()
        finally:
            # The temporary copy is private to this call; remove it.
            if temp_filepath is not None:
                try:
                    os.remove(temp_filepath)
                except OSError as exception:
                    logger.debug('unable to remove temporary file %s; %s', temp_filepath, exception)

        if proc.returncode != 0:
            logger.error(error_output)
            raise ParserError

        if output == '\x0c':
            logger.debug('Parser didn\'t any output')
            raise ParserError('No output')

        document_page.content = output
        document_page.page_label = _(u'Text extracted from PDF')
        document_page.save()
Beispiel #11
0
def convert_document(document, *args, **kwargs):
    """Return the image path for ``document``, converting only on a cache miss.

    The cache path is computed by ``create_image_cache_filename`` from the
    document checksum and the extra arguments.  If that path exists it is
    returned directly; otherwise the document is exported with
    ``document_save_to_temp_dir`` and the result of ``convert`` (called with
    the same extra arguments) is returned.
    """
    cached_path = create_image_cache_filename(document.checksum, *args, **kwargs)

    if not os.path.exists(cached_path):
        exported_file = document_save_to_temp_dir(document, document.checksum)
        return convert(exported_file, *args, **kwargs)

    return cached_path
Beispiel #12
0
def convert_document_for_ocr(document, page=0, format='tif'):
    """Prepare one page of ``document`` for OCR and return the output path.

    The document is exported with ``document_save_to_temp_dir``; the page's
    stored transformations are turned into a convert argument string and the
    page then goes through ``execute_convert`` (transformations, then
    ``OCR_OPTIONS``), ``execute_unpaper`` and a final ``execute_convert``.

    Args:
        document: object exposing ``uuid`` and ``documentpage_set``.
        page: page index used in file names and in the ``[page]`` selector;
            the page record is looked up with ``page_number=page + 1``.
        format: extension used for the transformed and final files.

    Returns:
        Path of the final converted file.

    Raises:
        Any exception raised by ``execute_convert`` or ``execute_unpaper``.
        These used to be silently discarded by a ``return`` inside the
        ``finally`` clause, handing callers a path to a file that may not
        exist.
    """
    import logging

    #Extract document file
    input_filepath = document_save_to_temp_dir(document, document.uuid)

    #Convert for OCR
    temp_filename, separator = os.path.splitext(
        os.path.basename(input_filepath))
    temp_path = os.path.join(TEMPORARY_DIRECTORY, temp_filename)
    transformation_output_file = '%s_trans%s%s%s' % (temp_path, page,
                                                     os.extsep, format)
    unpaper_input_file = '%s_unpaper_in%s%spnm' % (temp_path, page, os.extsep)
    unpaper_output_file = '%s_unpaper_out%s%spnm' % (temp_path, page,
                                                     os.extsep)
    convert_output_file = '%s_ocr%s%s%s' % (temp_path, page, os.extsep, format)

    input_arg = '%s[%s]' % (input_filepath, page)

    transformation_list = []
    try:
        #Catch invalid or non existing pages
        document_page = document.documentpage_set.get(document=document,
                                                      page_number=page + 1)
        for page_transformation in document_page.documentpagetransformation_set.all():
            try:
                if page_transformation.transformation in TRANFORMATION_CHOICES:
                    # NOTE(review): eval() executes whatever is stored in
                    # ``arguments``; if that field can hold untrusted input
                    # this is a code-execution risk and should be replaced
                    # by a safe parser.
                    output = TRANFORMATION_CHOICES[
                        page_transformation.transformation] % eval(
                            page_transformation.arguments)
                    transformation_list.append(output)
            except Exception as e:
                # There is no ``request`` in this function, so the old
                # per-user message raised NameError here.  A broken
                # transformation is logged and skipped instead.
                logging.getLogger(__name__).warning(
                    'Error for transformation %s: %s',
                    page_transformation.get_transformation_display(), e)
    except ObjectDoesNotExist:
        # No such page: continue without page specific transformations
        pass

    transformation_string = ' '.join(transformation_list)
    try:
        #Apply default transformations
        execute_convert(input_filepath=input_arg,
                        quality=QUALITY_HIGH,
                        arguments=transformation_string,
                        output_filepath=transformation_output_file)
        #Do OCR operations
        execute_convert(input_filepath=transformation_output_file,
                        arguments=OCR_OPTIONS,
                        output_filepath=unpaper_input_file)
        # Process by unpaper
        execute_unpaper(input_filepath=unpaper_input_file,
                        output_filepath=unpaper_output_file)
        # Convert to tif
        execute_convert(input_filepath=unpaper_output_file,
                        output_filepath=convert_output_file)
    finally:
        # Intermediate files are removed on success and on failure alike
        cleanup(transformation_output_file)
        cleanup(unpaper_input_file)
        cleanup(unpaper_output_file)

    # Returned outside ``finally`` so conversion errors reach the caller
    return convert_output_file