Example #1
0
    def process_coordinates(self, batch_path):
        """Run word-coordinate processing for every page of a batch.

        ``batch_path`` is split into a directory part and a batch name.
        When there is no directory part, the batch source URL is built by
        joining the name onto ``settings.BATCH_STORAGE`` and a trailing
        slash is ensured; when there is a directory part, ``None`` is
        passed as the source instead.

        The batch is fetched with ``self._get_batch(..., create=False)`` and
        stored on ``self.current_batch``. For each page of each issue, the
        page's OCR file URL is resolved against the batch storage URL, run
        through ``ocr_extractor``, and the resulting coordinates are handed
        to ``self._process_coordinates``.

        Raises:
            BatchLoaderException: if any step fails. The underlying
                exception is logged and attached as the cause.
        """
        _logger.info("process word coordinates for batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            # NOTE(review): presumably _get_batch works out the source on
            # its own when given None -- confirm against _get_batch.
            batch_source = None
        else:
            batch_source = urllib.parse.urljoin(settings.BATCH_STORAGE,
                                                batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name, batch_source, create=False)
            self.current_batch = batch
            for issue in batch.issues.all():
                for page in issue.pages.all():
                    url = urllib.parse.urljoin(self.current_batch.storage_url,
                                               page.ocr_filename)

                    # Only the coordinates are needed here; the per-language
                    # text returned alongside them is discarded.
                    _, coords = ocr_extractor(url)
                    self._process_coordinates(page, coords)
        except Exception as e:
            msg = "unable to process coordinates for batch: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            # Chain explicitly so the traceback shows the original failure
            # as the direct cause of the loader exception.
            raise BatchLoaderException(msg) from e
Example #2
0
    def process_ocr(self, page, index=True):
        """Extract OCR text for ``page``, store it, and optionally index it.

        The page's OCR file URL is resolved against
        ``self.current_batch.storage_url`` and passed to ``ocr_extractor``,
        which returns a mapping of language to text plus coordinate data.
        When ``self.PROCESS_COORDINATES`` is truthy the coordinates are
        handed to ``self._process_coordinates``.

        A new ``OCR`` record is saved for the page and one language-text
        row is created per language in the extractor output. Languages are
        matched on ``code`` or on a ``lingvoj`` value ending with the given
        language string; unmatched languages fall back to ``'eng'``.

        Args:
            page: the page whose OCR is processed; it is saved at the end.
            index: when true, the page's ``solr_doc`` is added to
                ``self.solr`` and ``page.indexed`` is set to ``True``.
        """
        # Pass the URL as a logger argument so the message is only
        # formatted when debug logging is enabled.
        _logger.debug("extracting ocr text and word coords for %s", page.url)

        url = urllib.parse.urljoin(self.current_batch.storage_url,
                                   page.ocr_filename)

        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        # Saved before the language texts are created through its
        # ``language_texts`` manager.
        ocr.save()
        for lang, text in lang_text.items():
            try:
                language = models.Language.objects.get(
                    Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                # default to english as per requirement
                language = models.Language.objects.get(code='eng')
            ocr.language_texts.create(language=language, text=text)
        page.ocr = ocr
        if index:
            _logger.debug("indexing ocr for: %s", page.url)
            self.solr.add(**page.solr_doc)
            page.indexed = True
        page.save()
Example #3
0
    def process_coordinates(self, batch_path):
        """Run word-coordinate processing for every page of a batch.

        ``batch_path`` is split into a directory part and a batch name.
        When there is no directory part, the batch source URL is built by
        joining the name onto ``settings.BATCH_STORAGE`` and a trailing
        slash is ensured; when there is a directory part, ``None`` is
        passed as the source instead.

        The batch is fetched with ``self._get_batch(..., create=False)`` and
        stored on ``self.current_batch``. For each page of each issue, the
        page's OCR file URL is resolved against the batch storage URL, run
        through ``ocr_extractor``, and the resulting coordinates are handed
        to ``self._process_coordinates``.

        Raises:
            BatchLoaderException: if any step fails; the underlying
                exception is logged first.
        """
        # Use the module logger, like the error path below, rather than
        # the root ``logging`` module.
        _logger.info("process word coordinates for batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path.rstrip("/"))
        if dirname:
            # NOTE(review): presumably _get_batch works out the source on
            # its own when given None -- confirm against _get_batch.
            batch_source = None
        else:
            batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"
        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = self._get_batch(batch_name, batch_source, create=False)
            self.current_batch = batch
            for issue in batch.issues.all():
                for page in issue.pages.all():
                    url = urlparse.urljoin(self.current_batch.storage_url,
                                           page.ocr_filename)

                    # Only the coordinates are needed here; the per-language
                    # text returned alongside them is discarded.
                    _, coords = ocr_extractor(url)
                    self._process_coordinates(page, coords)
        # ``except X as e`` is valid on Python 2.6+ and Python 3, unlike the
        # comma form.
        except Exception as e:
            msg = "unable to process coordinates for batch: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            raise BatchLoaderException(msg)
Example #4
0
    def process_ocr(self, page, index=True):
        """Extract OCR text for ``page``, store it, and optionally index it.

        The page's OCR file URL is resolved against
        ``self.current_batch.storage_url`` and passed to ``ocr_extractor``,
        which returns a mapping of language to text plus coordinate data.
        When ``self.PROCESS_COORDINATES`` is truthy the coordinates are
        handed to ``self._process_coordinates``.

        A new ``OCR`` record is saved for the page and one language-text
        row is created per language in the extractor output. Languages are
        matched on ``code`` or on a ``lingvoj`` value ending with the given
        language string; unmatched languages fall back to ``'eng'``.

        Args:
            page: the page whose OCR is processed; it is saved at the end.
            index: when true, the page's ``solr_doc`` is added to
                ``self.solr`` and ``page.indexed`` is set to ``True``.
        """
        # Pass the URL as a logger argument so the message is only
        # formatted when debug logging is enabled.
        _logger.debug("extracting ocr text and word coords for %s", page.url)

        url = urlparse.urljoin(self.current_batch.storage_url,
                               page.ocr_filename)

        lang_text, coords = ocr_extractor(url)

        if self.PROCESS_COORDINATES:
            self._process_coordinates(page, coords)

        ocr = OCR()
        ocr.page = page
        # Saved before the language texts are created through its
        # ``language_texts`` manager.
        ocr.save()
        # ``items()`` exists on both Python 2 and 3 dicts; ``iteritems()``
        # is Python 2 only.
        for lang, text in lang_text.items():
            try:
                language = models.Language.objects.get(
                    Q(code=lang) | Q(lingvoj__iendswith=lang))
            except models.Language.DoesNotExist:
                # default to english as per requirement
                language = models.Language.objects.get(code='eng')
            ocr.language_texts.create(language=language,
                                      text=text)
        page.ocr = ocr
        if index:
            _logger.debug("indexing ocr for: %s", page.url)
            self.solr.add(**page.solr_doc)
            page.indexed = True
        page.save()
Example #5
0
    def test_extractor(self):
        """Check ``ocr_extractor`` output against the ``test-data`` fixtures.

        Runs the extractor on ``ocr.xml`` and compares the extracted text
        with ``ocr.txt`` (under the ``"eng"`` key), then checks the number
        of distinct coordinate keys and the handling of trailing
        punctuation.
        """
        data_dir = join(dirname(dirname(__file__)), 'test-data')
        ocr_file = join(data_dir, 'ocr.xml')
        text, coord_info = ocr_extractor(ocr_file)
        coords = coord_info["coords"]
        # Read the expected text inside a context manager so the file
        # handle is closed instead of being left to the garbage collector.
        with open(join(data_dir, 'ocr.txt'), encoding='utf-8') as expected_file:
            expected_text = {"eng": expected_file.read()}

        self.assertEqual(text, expected_text)
        self.assertEqual(len(coords), 2150)
        self.assertEqual(len(coords['place']), 3)
        # Craft. should be normalized to Craft
        # since Solr's highlighting will not include
        # trailing punctuation in highlighted text
        self.assertIn('Craft', coords)
        self.assertNotIn('Craft.', coords)
Example #6
0
    def test_extractor(self):
        """Check ``ocr_extractor`` output against the ``test-data`` fixtures.

        Runs the extractor on ``ocr.xml`` and compares the extracted text
        with the UTF-8 decoded contents of ``ocr.txt`` (under the ``"eng"``
        key), then checks the number of distinct coordinate keys and the
        handling of trailing punctuation.
        """
        data_dir = join(dirname(dirname(__file__)), "test-data")
        ocr_file = join(data_dir, "ocr.xml")
        text, coord_info = ocr_extractor(ocr_file)
        coords = coord_info["coords"]
        # ``open`` in binary mode plus an explicit decode works on both
        # Python 2 and 3 (``file()`` is Python 2 only), and the context
        # manager closes the handle.
        with open(join(data_dir, "ocr.txt"), "rb") as expected_file:
            expected_text = {"eng": expected_file.read().decode("utf-8")}

        self.assertEqual(text, expected_text)
        self.assertEqual(len(coords), 2150)
        self.assertEqual(len(coords["place"]), 3)
        # Craft. should be normalized to Craft
        # since Solr's highlighting will not include
        # trailing punctuation in highlighted text
        # ``in`` replaces ``dict.has_key``, which was removed in Python 3.
        self.assertTrue("Craft" in coords)
        self.assertTrue("Craft." not in coords)