def page_ocr_txt(request, lccn, date, edition, sequence):
    """Serve the raw OCR text of a single newspaper page as text/plain.

    Raises Http404 when no OCR record exists for the page.
    """
    _title, _issue, page = _get_tip(lccn, date, edition, sequence)
    try:
        ocr_text = get_page_text(page)
    except models.OCR.DoesNotExist:
        raise Http404("No OCR for %s" % page)
    resp = HttpResponse(ocr_text, content_type='text/plain')
    return add_cache_tag(resp, "lccn=%s" % lccn)
def page_ocr(request, lccn, date, edition, sequence):
    """Render the OCR-text HTML view ('page_text.html') for one page.

    NOTE: the template context is built from locals(), so every local
    variable name below (title, issue, page, page_title, crumbs, host,
    text) is part of the template contract -- do not rename them.
    """
    title, issue, page = _get_tip(lccn, date, edition, sequence)
    page_title = "%s, %s, %s" % (label(title), label(issue), label(page))
    crumbs = create_crumbs(title, issue, date, edition, page)
    host = request.get_host()
    text = get_page_text(page)
    response = render_to_response('page_text.html', dictionary=locals(),
                                  context_instance=RequestContext(request))
    # Tag the response with the title's LCCN (used by add_cache_tag;
    # presumably for cache invalidation per title -- see that helper).
    return add_cache_tag(response, "lccn=%s" % lccn)
def test_getting_text_from_solr_slovenia(self):
    """Exercise get_page_text() against batch_iune_oriole_ver01.

    Loads the batch, resolves a page object via _get_tip(), checks that
    known OCR snippets appear in the extracted text, then purges the
    batch and confirms it is gone from the db.
    """
    batch_name = 'batch_iune_oriole_ver01'
    batch_dir = os.path.join(settings.BATCH_STORAGE, batch_name)
    self.assertTrue(os.path.isdir(batch_dir))

    loader = BatchLoader(process_ocr=True)
    loaded = loader.load_batch(batch_dir)
    self.assertEqual(loaded.name, batch_name)

    _title, _issue, page = _get_tip('sn83045377', '1906-03-01', 1, 1)
    ocr = get_page_text(page)
    self.assertIn("Od Mizo in dale", ocr[0])
    self.assertIn("To je preecj inoettii tobak! Marsi", ocr[0])

    # purge the batch and make sure it's gone from the db
    loader.purge_batch(batch_name)
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(Title.objects.get(lccn='sn83045377').has_issues, False)
def test_getting_text_from_solr_utah(self):
    """Exercise get_page_text() against batch_uuml_thys_ver01.

    Loads the batch, resolves a page object via _get_tip(), checks that
    known OCR snippets appear in the extracted text, then purges the
    batch and confirms it is gone from the db.
    """
    batch_name = 'batch_uuml_thys_ver01'
    batch_dir = os.path.join(settings.BATCH_STORAGE, batch_name)
    self.assertTrue(os.path.isdir(batch_dir))

    loader = BatchLoader(process_ocr=True)
    loaded = loader.load_batch(batch_dir)
    self.assertEqual(loaded.name, batch_name)

    _title, _issue, page = _get_tip('sn83045396', '1911-09-17', 1, 1)
    ocr = get_page_text(page)
    self.assertIn("Uc nice at tlio slate fair track", ocr[0])
    self.assertIn("PAGES FIVE CENTS", ocr[0])
    self.assertIn('gBter ho had left the grounds that', ocr[0])

    # purge the batch and make sure it's gone from the db
    loader.purge_batch(batch_name)
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(Title.objects.get(lccn='sn83045396').has_issues, False)
def test_getting_text_from_solr_slovenia(self):
    """Check get_page_text() output for batch batch_iune_oriole_ver01.

    Builds a page object with _get_tip() and asserts that expected OCR
    fragments are present, then purges the batch again.
    """
    storage_path = os.path.join(settings.BATCH_STORAGE,
                                'batch_iune_oriole_ver01')
    self.assertTrue(os.path.isdir(storage_path))

    batch_loader = BatchLoader(process_ocr=True)
    batch = batch_loader.load_batch(storage_path)
    self.assertEqual(batch.name, 'batch_iune_oriole_ver01')

    title, issue, page = _get_tip('sn83045377', '1906-03-01', 1, 1)
    page_text = get_page_text(page)
    self.assertIn("Od Mizo in dale", page_text[0])
    self.assertIn("To je preecj inoettii tobak! Marsi", page_text[0])

    # purge the batch and make sure it's gone from the db
    batch_loader.purge_batch('batch_iune_oriole_ver01')
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(
        Title.objects.get(lccn='sn83045377').has_issues, False)
def test_getting_text_from_solr_utah(self):
    """Check get_page_text() output for batch batch_uuml_thys_ver01.

    Builds a page object with _get_tip() and asserts that expected OCR
    fragments are present, then purges the batch again.
    """
    storage_path = os.path.join(settings.BATCH_STORAGE,
                                'batch_uuml_thys_ver01')
    self.assertTrue(os.path.isdir(storage_path))

    batch_loader = BatchLoader(process_ocr=True)
    batch = batch_loader.load_batch(storage_path)
    self.assertEqual(batch.name, 'batch_uuml_thys_ver01')

    title, issue, page = _get_tip('sn83045396', '1911-09-17', 1, 1)
    page_text = get_page_text(page)
    self.assertIn("Uc nice at tlio slate fair track", page_text[0])
    self.assertIn("PAGES FIVE CENTS", page_text[0])
    self.assertIn('gBter ho had left the grounds that', page_text[0])

    # purge the batch and make sure it's gone from the db
    batch_loader.purge_batch('batch_uuml_thys_ver01')
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(
        Title.objects.get(lccn='sn83045396').has_issues, False)
def page(request, lccn, date, edition, sequence):
    """Render the main page view ("page.html") for one newspaper page.

    Resolves the title/issue/page, optionally redirects search-engine
    visitors to the word-highlight URL, computes previous/next issue
    navigation links, and renders the template.

    NOTE: the template context is built from locals(), so every local
    variable name in this function is part of the template contract --
    do not rename them.
    """
    title, issue, page = _get_tip(lccn, date, edition, sequence)

    # When the page has no image, expose any "noteAboutReproduction"
    # note text to the template as `explanation` (via locals()).
    if not page.jp2_filename:
        notes = page.notes.filter(type="noteAboutReproduction")
        num_notes = notes.count()
        if num_notes >= 1:
            explanation = notes[0].text
        else:
            explanation = ""

    # see if the user came from search engine results and attempt to
    # highlight words from their query by redirecting to a url that
    # has the highlighted words in it
    try:
        words = _search_engine_words(request)
        words = '+'.join(words)
        if len(words) > 0:
            path_parts = dict(lccn=lccn, date=date, edition=edition,
                              sequence=sequence)
            # Redirect to the words view, carrying the query string and
            # placing the joined words in the URL fragment.
            url = '%s?%s#%s' % (
                urlresolvers.reverse('chronam_page_words',
                                     kwargs=path_parts),
                request.GET.urlencode(), words)
            response = HttpResponseRedirect(url)
            return add_cache_tag(response, "lccn=%s" % lccn)
    except Exception as exc:
        LOGGER.error(
            "Failed to add search highlighting based on the referred search engine query: %s",
            exc, exc_info=True)
        if settings.DEBUG:
            raise
        # else squish the exception so the page will still get
        # served up minus the highlights

    # Calculate the previous_issue_first_page. Note: it was decided
    # that we want to skip over issues with missing pages. See ticket
    # #383.
    _issue = issue
    while True:
        previous_issue_first_page = None
        _issue = _issue.previous
        if not _issue:
            break
        previous_issue_first_page = _issue.first_page
        if previous_issue_first_page:
            break

    # do the same as above but for next_issue this time.
    _issue = issue
    while True:
        next_issue_first_page = None
        _issue = _issue.next
        if not _issue:
            break
        next_issue_first_page = _issue.first_page
        if next_issue_first_page:
            break

    page_title = "%s, %s, %s" % (label(title), label(issue), label(page))
    page_head_heading = "%s, %s, %s" % (title.display_name, label(issue),
                                        label(page))
    page_head_subheading = label(title)
    crumbs = create_crumbs(title, issue, date, edition, page)

    filename = page.jp2_abs_filename
    if filename:
        try:
            # os.path.getsize returns a byte count; filesizeformat
            # renders it human-readable for the template.
            im = os.path.getsize(filename)
            image_size = filesizeformat(im)
        except OSError:
            image_size = "Unknown"

    image_credit = issue.batch.awardee.name
    host = request.get_host()
    profile_uri = 'http://www.openarchives.org/ore/html/'
    template = "page.html"
    text = get_page_text(page)
    response = render_to_response(template, dictionary=locals(),
                                  context_instance=RequestContext(request))
    return add_cache_tag(response, "lccn=%s" % lccn)
def page(request, lccn, date, edition, sequence):
    """Render "page.html" for a single newspaper page.

    Looks up the title/issue/page triple, may redirect visitors coming
    from a search engine to a URL that highlights their query words,
    finds the first page of the previous and next issues for
    navigation, and finally renders the page template.

    NOTE: render_to_response receives dictionary=locals(), so each
    local variable defined here is visible to the template under its
    own name -- renaming locals changes the template context.
    """
    title, issue, page = _get_tip(lccn, date, edition, sequence)

    # No page image: pull a "noteAboutReproduction" note (if any) so
    # the template can show an explanation instead.
    if not page.jp2_filename:
        notes = page.notes.filter(type="noteAboutReproduction")
        num_notes = notes.count()
        if num_notes >= 1:
            explanation = notes[0].text
        else:
            explanation = ""

    # see if the user came from search engine results and attempt to
    # highlight words from their query by redirecting to a url that
    # has the highlighted words in it
    try:
        words = _search_engine_words(request)
        words = '+'.join(words)
        if len(words) > 0:
            path_parts = dict(lccn=lccn, date=date, edition=edition,
                              sequence=sequence)
            url = '%s?%s#%s' % (
                urlresolvers.reverse('chronam_page_words',
                                     kwargs=path_parts),
                request.GET.urlencode(), words)
            response = HttpResponseRedirect(url)
            return add_cache_tag(response, "lccn=%s" % lccn)
    except Exception as exc:
        LOGGER.error(
            "Failed to add search highlighting based on the referred search engine query: %s",
            exc, exc_info=True)
        if settings.DEBUG:
            raise
        # else squish the exception so the page will still get
        # served up minus the highlights

    # Calculate the previous_issue_first_page. Note: it was decided
    # that we want to skip over issues with missing pages. See ticket
    # #383.
    _issue = issue
    while True:
        previous_issue_first_page = None
        _issue = _issue.previous
        if not _issue:
            break
        previous_issue_first_page = _issue.first_page
        if previous_issue_first_page:
            break

    # do the same as above but for next_issue this time.
    _issue = issue
    while True:
        next_issue_first_page = None
        _issue = _issue.next
        if not _issue:
            break
        next_issue_first_page = _issue.first_page
        if next_issue_first_page:
            break

    page_title = "%s, %s, %s" % (label(title), label(issue), label(page))
    page_head_heading = "%s, %s, %s" % (title.display_name, label(issue),
                                        label(page))
    page_head_subheading = label(title)
    crumbs = create_crumbs(title, issue, date, edition, page)

    filename = page.jp2_abs_filename
    if filename:
        try:
            # Byte size of the JP2 on disk, formatted for display.
            im = os.path.getsize(filename)
            image_size = filesizeformat(im)
        except OSError:
            image_size = "Unknown"

    image_credit = issue.batch.awardee.name
    host = request.get_host()
    profile_uri = 'http://www.openarchives.org/ore/html/'
    template = "page.html"
    text = get_page_text(page)
    response = render_to_response(template, dictionary=locals(),
                                  context_instance=RequestContext(request))
    return add_cache_tag(response, "lccn=%s" % lccn)