Exemple #1
0
    def process_coordinates(self, batch_path):
        """Extract and store word coordinates for every page of a batch.

        ``batch_path`` is split into a directory part and a batch name. When
        there is no directory part, the batch source is built by joining the
        batch name onto ``settings.BATCH_STORAGE`` (with a trailing slash);
        otherwise the source is left as ``None``. The batch is then looked up
        with ``create=False`` and stored in ``self.current_batch``.

        Pages that have no OCR file are skipped with a warning instead of
        aborting the whole batch.

        Raises:
            BatchLoaderException: if looking up the batch or processing any
                page fails; the original error is logged with its traceback.
        """
        logging.info("process word coordinates for batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            batch_source = None
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name, batch_source, create=False)
            self.current_batch = batch
            for issue in batch.issues.all():
                for page in issue.pages.all():
                    if not page.ocr_filename:
                        # Without an OCR file there is nothing to join onto
                        # the storage URL, so skip the page.
                        _logger.warning(
                            "Batch [%s] has page [%s] that has no OCR. "
                            "Skipping processing coordinates for page.",
                            batch_name, page)
                        continue

                    url = urlparse.urljoin(self.current_batch.storage_url,
                                           page.ocr_filename)

                    # Only the coordinates are needed here; the extracted
                    # text is discarded.
                    _lang_text, coords = ocr_extractor(url)
                    self._process_coordinates(page, coords)
        except Exception as e:
            msg = "unable to process coordinates for batch: %s" % e
            # exception() logs the message at ERROR level with the traceback,
            # so a separate error() call is not needed.
            _logger.exception(msg)
            raise BatchLoaderException(msg)
Exemple #2
0
 def process_coordinates(self, batch_path):
     """Extract and store word coordinates for every page of a batch.

     ``batch_path`` is split into a directory part and a batch name. When
     there is no directory part, the batch source is built by joining the
     batch name onto ``settings.BATCH_STORAGE`` (with a trailing slash);
     otherwise the source is left as ``None``. The batch is then looked up
     with ``create=False`` and stored in ``self.current_batch``.

     Pages that have no OCR file are skipped with a warning.

     Raises:
         BatchLoaderException: if looking up the batch or processing any
             page fails; the original error is logged with its traceback.
     """
     logging.info("process word coordinates for batch at %s", batch_path)
     dirname, batch_name = os.path.split(batch_path.rstrip("/"))
     if dirname:
         batch_source = None
     else:
         batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
         if not batch_source.endswith("/"):
             batch_source += "/"
     batch_name = _normalize_batch_name(batch_name)
     try:
         batch = self._get_batch(batch_name, batch_source, create=False)
         self.current_batch = batch
         for issue in batch.issues.all():
             for page in issue.pages.all():
                 if not page.ocr_filename:
                     logging.warning(
                         "Batch [%s] has page [%s] that has no OCR. Skipping processing coordinates for page.",
                         batch_name, page)
                     continue

                 url = urlparse.urljoin(self.current_batch.storage_url,
                                        page.ocr_filename)
                 logging.debug("Extracting OCR from url %s", url)
                 # Only the coordinates are needed here; the extracted text
                 # is discarded.
                 _lang_text, coords = ocr_extractor(url)
                 self._process_coordinates(page, coords)
     except Exception as e:
         msg = "unable to process coordinates for batch: %s" % e
         # exception() logs the message at ERROR level with the traceback,
         # so a separate error() call is not needed.
         _logger.exception(msg)
         raise BatchLoaderException(msg)
Exemple #3
0
    def process_ocr(self, page, index=True):
        """Extract OCR for ``page``, store it, and optionally index the page.

        The OCR file is located by joining ``page.ocr_filename`` onto the
        current batch's storage URL. A new ``OCR`` record is saved for the
        page with one language-text entry per extracted language; a language
        code that matches no ``Language`` row falls back to English.

        Args:
            page: the page whose OCR should be processed.
            index: when true, the page's Solr document is added to
                ``self.solr`` and the page is flagged as indexed.
        """
        _logger.debug("extracting ocr text and word coords for %s", page.url)

        url = urlparse.urljoin(self.current_batch.storage_url,
                               page.ocr_filename)

        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        # Saved before the loop so language texts can be created through
        # the relation below.
        ocr.save()
        for lang, text in lang_text.items():
            try:
                language = models.Language.objects.get(
                    Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                # default to english as per requirement
                language = models.Language.objects.get(code='eng')
            ocr.language_texts.create(language=language,
                                      text=text)
        page.ocr = ocr
        if index:
            _logger.debug("indexing ocr for: %s", page.url)
            self.solr.add(**page.solr_doc)
            page.indexed = True
        page.save()
Exemple #4
0
    def process_ocr(self, page, index=True):
        """Extract OCR for ``page``, store it, and optionally index the page.

        The OCR file is located by joining ``page.ocr_filename`` onto the
        current batch's storage URL. A new ``OCR`` record is saved for the
        page with one language-text entry per extracted language; a language
        code that matches no ``Language`` row falls back to English.

        Args:
            page: the page whose OCR should be processed.
            index: when true, the page's Solr document is added to
                ``self.solr`` and the page is flagged as indexed.
        """
        _logger.debug("extracting ocr text and word coords for %s", page.url)

        url = urlparse.urljoin(self.current_batch.storage_url,
                               page.ocr_filename)

        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        # Saved before the loop so language texts can be created through
        # the relation below.
        ocr.save()
        for lang, text in lang_text.items():
            try:
                language = models.Language.objects.get(
                    Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                # default to english as per requirement
                language = models.Language.objects.get(code='eng')
            ocr.language_texts.create(language=language, text=text)
        page.ocr = ocr
        if index:
            _logger.debug("indexing ocr for: %s", page.url)
            self.solr.add(**page.solr_doc)
            page.indexed = True
        page.save()
 def process_coordinates(self, batch_path):
     """Extract and store word coordinates for every page of a batch.

     ``batch_path`` is split into a directory part and a batch name. When
     there is no directory part, the batch source is built by joining the
     batch name onto ``settings.BATCH_STORAGE`` (with a trailing slash);
     otherwise the source is left as ``None``. The batch is then looked up
     with ``create=False`` and stored in ``self.current_batch``.

     Pages that have no OCR file are skipped with a warning.

     Raises:
         BatchLoaderException: if looking up the batch or processing any
             page fails; the original error is logged with its traceback.
     """
     LOGGER.info("process word coordinates for batch at %s", batch_path)
     dirname, batch_name = os.path.split(batch_path.rstrip("/"))
     if dirname:
         batch_source = None
     else:
         batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
         if not batch_source.endswith("/"):
             batch_source += "/"
     batch_name = _normalize_batch_name(batch_name)
     try:
         batch = self._get_batch(batch_name, batch_source, create=False)
         self.current_batch = batch
         for issue in batch.issues.all():
             for page in issue.pages.all():
                 if not page.ocr_filename:
                     LOGGER.warning(
                         "Batch [%s] has page [%s] that has no OCR. Skipping processing coordinates for page.",
                         batch_name, page)
                     continue

                 url = urlparse.urljoin(self.current_batch.storage_url,
                                        page.ocr_filename)
                 LOGGER.debug("Extracting OCR from url %s", url)
                 # Only the coordinates are needed here; the extracted text
                 # is discarded.
                 _lang_text, coords = ocr_extractor(url)
                 self._process_coordinates(page, coords)
     except Exception as e:
         msg = "unable to process coordinates for batch: %s" % e
         LOGGER.exception(msg)
         raise BatchLoaderException(msg)
Exemple #6
0
    def process_ocr(self, page):
        """Extract OCR for ``page``, record its languages, and return the page.

        The OCR file is located by joining ``page.ocr_filename`` onto the
        current batch's storage URL. A new ``OCR`` record is saved for the
        page with one language-text entry per extracted language (no text is
        passed to that entry). The extracted text is instead collected in a
        dict keyed by the resolved language code and attached to the page as
        ``page.lang_text``.

        A language code that matches no ``Language`` row falls back to
        English, with a warning.

        Returns:
            The same ``page`` object, after it has been saved.
        """
        LOGGER.debug("extracting ocr text and word coords for %s", page.url)

        url = urlparse.urljoin(self.current_batch.storage_url,
                               page.ocr_filename)

        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        # Saved before the loop so language entries can be created through
        # the relation below.
        ocr.save()
        lang_text_solr = {}
        for lang, text in lang_text.items():
            try:
                language = models.Language.objects.get(Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                LOGGER.warning("Language %s does not exist in the database. Defaulting to English.", lang)
                # default to english as per requirement
                language = models.Language.objects.get(code='eng')
            ocr.language_texts.create(language=language)
            # Keyed by the resolved code, so several inputs resolving to the
            # same language keep only the last text seen.
            lang_text_solr[language.code] = text

        page.ocr = ocr
        page.lang_text = lang_text_solr
        page.save()
        return page
    def test_extractor(self):
        """Check the text and word coordinates extracted from the sample OCR file."""
        data_dir = join(dirname(dirname(__file__)), 'test-data')
        ocr_file = join(data_dir, 'ocr.xml')
        text, coord_info = ocr_extractor(ocr_file)
        coords = coord_info["coords"]
        # Read the fixture as bytes and decode explicitly; the context
        # manager closes the file instead of leaving that to garbage
        # collection.
        with open(join(data_dir, 'ocr.txt'), 'rb') as expected_file:
            expected_text = {"eng": expected_file.read().decode('utf-8')}

        self.assertEqual(text, expected_text)
        self.assertEqual(len(coords), 2489)
        self.assertEqual(len(coords['place']), 3)
    def test_extractor(self):
        """Check the text and word coordinates extracted from the sample OCR file."""
        data_dir = join(dirname(dirname(__file__)), 'test-data')
        ocr_file = join(data_dir, 'ocr.xml')
        text, coord_info = ocr_extractor(ocr_file)
        coords = coord_info["coords"]
        # Read the fixture as bytes and decode explicitly; the context
        # manager closes the file instead of leaving that to garbage
        # collection.
        with open(join(data_dir, 'ocr.txt'), 'rb') as expected_file:
            expected_text = {
                "eng": expected_file.read().decode('utf-8')
            }

        self.assertEqual(text, expected_text)
        self.assertEqual(len(coords), 2489)
        self.assertEqual(len(coords['place']), 3)
Exemple #9
0
    def test_extractor(self):
        """Check the text and word coordinates extracted from the sample OCR file."""
        data_dir = join(dirname(dirname(__file__)), 'test-data')
        ocr_file = join(data_dir, 'ocr.xml')
        text, coord_info = ocr_extractor(ocr_file)
        coords = coord_info["coords"]
        # Read the fixture as bytes and decode explicitly; the context
        # manager closes the file instead of leaving that to garbage
        # collection.
        with open(join(data_dir, 'ocr.txt'), 'rb') as expected_file:
            expected_text = {"eng": expected_file.read().decode('utf-8')}

        self.assertEqual(text, expected_text)
        self.assertEqual(len(coords), 2150)
        self.assertEqual(len(coords['place']), 3)
        # Craft. should be normalized to Craft
        # since Solr's highlighting will not include
        # trailing punctuation in highlighted text
        self.assertTrue('Craft' in coords)
        self.assertTrue('Craft.' not in coords)
Exemple #10
0
    def test_extractor(self):
        """Check the text and word coordinates extracted from the sample OCR file."""
        data_dir = join(dirname(dirname(__file__)), 'test-data')
        ocr_file = join(data_dir, 'ocr.xml')
        text, coord_info = ocr_extractor(ocr_file)
        coords = coord_info["coords"]
        # Read the fixture as bytes and decode explicitly; the context
        # manager closes the file instead of leaving that to garbage
        # collection.
        with open(join(data_dir, 'ocr.txt'), 'rb') as expected_file:
            expected_text = {
                "eng": expected_file.read().decode('utf-8')
            }

        self.assertEqual(text, expected_text)
        self.assertEqual(len(coords), 2150)
        self.assertEqual(len(coords['place']), 3)
        # Craft. should be normalized to Craft
        # since Solr's highlighting will not include
        # trailing punctuation in highlighted text
        self.assertTrue('Craft' in coords)
        self.assertTrue('Craft.' not in coords)
Exemple #11
0
    def solr_doc(self):
        """Build the Solr document describing this page.

        The document starts from the parent title's Solr document, has the
        title-only fields removed, gains the page-level fields, and finally
        gets one ``ocr_<lang>`` field per OCR language.
        """
        issued = self.issue.date_issued
        issued_stamp = "%4i%02i%02i" % (issued.year, issued.month, issued.day)

        # The title's document is the starting point; these fields are not
        # worth repeating on every page.
        doc = self.issue.title.solr_doc
        for title_only_field in ("essay", "url", "holding_type"):
            del doc[title_only_field]

        page_fields = {
            "id": self.url,
            "type": "page",
            "batch": self.issue.batch.name,
            "date": issued_stamp,
            "page": self.number,
            "sequence": self.sequence,
            "section_label": self.section_label,
            "edition_label": self.issue.edition_label,
        }
        doc.update(page_fields)

        # The OCR text is needed when building the solr index.
        # TODO this is also used when visiting a page like http://127.0.0.1:8000/search/pages/results/?state=&date1=1789&date2=1963&proxtext=&x=0&y=0&dateFilterType=yearRange&rows=20&searchType=basic&format=json
        # In that case we might want to stop using this and pull directly
        # from SOLR for performance reasons.
        # When ingesting a batch, ocr_abs_filename may not be set, so the
        # text already attached to the page is the fallback.
        ocr_texts = self.lang_text
        if self.ocr_abs_filename is not None:
            logging.debug("extracting ocr for solr page")
            ocr_texts, _ = ocr_extractor(self.ocr_abs_filename)

        for lang, ocr_text in ocr_texts.items():
            # Languages Solr is not configured for are indexed as English.
            # NOTE(review): several such languages all land in "ocr_eng",
            # each overwriting the previous one - confirm this is intended.
            field_lang = lang if lang in settings.SOLR_LANGUAGES else "eng"
            doc["ocr_%s" % field_lang] = ocr_text
        return doc
Exemple #12
0
    def solr_doc(self):
        """Build the Solr document describing this page.

        The document starts from the parent title's Solr document, has the
        title-only fields removed, gains the page-level fields, and finally
        gets one ``ocr_<lang>`` field per OCR language.
        """
        issued = self.issue.date_issued
        issued_stamp = "%4i%02i%02i" % (issued.year, issued.month, issued.day)

        # The title's document is the starting point; these fields are not
        # worth repeating on every page.
        doc = self.issue.title.solr_doc
        for title_only_field in ('essay', 'url', 'holding_type'):
            del doc[title_only_field]

        page_fields = {
            'id': self.url,
            'type': 'page',
            'batch': self.issue.batch.name,
            'date': issued_stamp,
            'page': self.number,
            'sequence': self.sequence,
            'section_label': self.section_label,
            'edition_label': self.issue.edition_label,
        }
        doc.update(page_fields)

        # The OCR text is needed when building the solr index.
        # TODO this is also used when visiting a page like http://127.0.0.1:8000/search/pages/results/?state=&date1=1789&date2=1963&proxtext=&x=0&y=0&dateFilterType=yearRange&rows=20&searchType=basic&format=json
        # In that case we might want to stop using this and pull directly
        # from SOLR for performance reasons.
        # When ingesting a batch, ocr_abs_filename may not be set, so the
        # text already attached to the page is the fallback.
        ocr_texts = self.lang_text
        if self.ocr_abs_filename is not None:
            logging.debug("extracting ocr for solr page")
            ocr_texts, _ = ocr_extractor(self.ocr_abs_filename)

        for lang, ocr_text in ocr_texts.items():
            # Languages Solr is not configured for are indexed as English.
            # NOTE(review): several such languages all land in "ocr_eng",
            # each overwriting the previous one - confirm this is intended.
            field_lang = lang if lang in settings.SOLR_LANGUAGES else "eng"
            doc['ocr_%s' % field_lang] = ocr_text
        return doc