Example #1
0
    def handle(self, batch_name, *args, **options):
        if len(args) != 0:
            raise CommandError('Usage is load_batch %s' % self.args)

        loader = BatchLoader(
            process_ocr=options['process_ocr'], process_coordinates=options['process_coordinates']
        )
        try:
            loader.load_batch(batch_name)
        except BatchLoaderException:
            LOGGER.exception("Unable to load batch %s", batch_name)
            raise CommandError("unable to load batch. check the load_batch log for clues")
Example #2
0
    def handle(self, batch_name, *args, **options):
        if len(args) != 0:
            raise CommandError('Usage is load_batch %s' % self.args)

        loader = BatchLoader(
            process_ocr=options['process_ocr'],
            process_coordinates=options['process_coordinates'])
        try:
            loader.load_batch(batch_name)
        except BatchLoaderException as e:
            LOGGER.exception(e)
            raise CommandError(
                "unable to load batch. check the load_batch log for clues")
Example #3
0
    def handle(self, batch_path, *args, **options):
        if len(args) != 0:
            raise CommandError("Usage is load_batch %s" % self.args)

        if not os.path.isdir(batch_path):
            raise CommandError("Path %s does not exist" % batch_path)

        batch_path = os.path.realpath(batch_path)

        loader = BatchLoader(
            process_ocr=options["process_ocr"],
            process_coordinates=options["process_coordinates"])

        try:
            loader.load_batch(batch_path)
        except Exception:
            LOGGER.exception("Unable to load batch from %s", batch_path)
Example #4
0
def load_batch(batch_dir, service_request=None, process_coordinates=True):
    try:
        batch_loader = BatchLoader(process_coordinates=process_coordinates)
        batch = batch_loader.load_batch(batch_dir)
        logger.info("loaded batch %s", batch)
        if service_request:
            logger.info("marking service request as complete")
            service_request.complete()

    except Exception as e:
        logger.exception("unable to load batch %s", batch_dir)
        if service_request:
            logger.info("marking service request as failed")
            service_request.fail(str(e))
Example #5
0
def load_batch(batch_dir, service_request=None, process_coordinates=True):
    try:
        batch_loader = BatchLoader(process_coordinates=process_coordinates)
        batch = batch_loader.load_batch(batch_dir)
        logger.info("loaded batch %s", batch)
        if service_request:
            logger.info("marking service request as complete")
            service_request.complete()

    except Exception, e:
        logger.exception("unable to load batch %s" % batch_dir)
        if service_request:
            logger.info("marking service request as failed")
            service_request.fail(str(e))
    def test_getting_text_from_solr_slovenia(self):
        """
        tests get_page_text() with batch batch_iune_oriole_ver01.
        First creates a page object 'page' with _get_tip()
        then uses it as an argument to get_page_text()
        """
        batch_dir = os.path.join(settings.BATCH_STORAGE, 'batch_iune_oriole_ver01')
        self.assertTrue(os.path.isdir(batch_dir))
        loader = BatchLoader(process_ocr=True)
        batch = loader.load_batch(batch_dir)
        self.assertEqual(batch.name, 'batch_iune_oriole_ver01')
        title, issue, page = _get_tip('sn83045377', '1906-03-01', 1, 1)
        text = get_page_text(page)
        self.assertIn("Od Mizo in dale", text[0])
        self.assertIn("To je preecj inoettii tobak! Marsi", text[0])

        # purge the batch and make sure it's gone from the db
        loader.purge_batch('batch_iune_oriole_ver01')
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(Title.objects.get(lccn='sn83045377').has_issues, False)
    def test_getting_text_from_solr_utah(self):
        """
        tests get_page_text() with batch batch_uuml_thys_ver01.
        First creates a page object 'page' with _get_tip()
        then uses it as an argument to get_page_text()
        """
        batch_dir = os.path.join(settings.BATCH_STORAGE, 'batch_uuml_thys_ver01')
        self.assertTrue(os.path.isdir(batch_dir))
        loader = BatchLoader(process_ocr=True)
        batch = loader.load_batch(batch_dir)
        self.assertEqual(batch.name, 'batch_uuml_thys_ver01')
        title, issue, page = _get_tip('sn83045396', '1911-09-17', 1, 1)
        text = get_page_text(page)
        self.assertIn("Uc nice at tlio slate fair track", text[0])
        self.assertIn("PAGES FIVE CENTS", text[0])
        self.assertIn('gBter ho had left the grounds that', text[0])

        # purge the batch and make sure it's gone from the db
        loader.purge_batch('batch_uuml_thys_ver01')
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(Title.objects.get(lccn='sn83045396').has_issues, False)
Example #8
0
    def test_getting_text_from_solr_slovenia(self):
        """
        tests get_page_text() with batch batch_iune_oriole_ver01.
        First creates a page object 'page' with _get_tip()
        then uses it as an argument to get_page_text()
        """
        batch_dir = os.path.join(settings.BATCH_STORAGE,
                                 'batch_iune_oriole_ver01')
        self.assertTrue(os.path.isdir(batch_dir))
        loader = BatchLoader(process_ocr=True)
        batch = loader.load_batch(batch_dir)
        self.assertEqual(batch.name, 'batch_iune_oriole_ver01')
        title, issue, page = _get_tip('sn83045377', '1906-03-01', 1, 1)
        text = get_page_text(page)
        self.assertIn("Od Mizo in dale", text[0])
        self.assertIn("To je preecj inoettii tobak! Marsi", text[0])

        # purge the batch and make sure it's gone from the db
        loader.purge_batch('batch_iune_oriole_ver01')
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(
            Title.objects.get(lccn='sn83045377').has_issues, False)
Example #9
0
    def test_getting_text_from_solr_utah(self):
        """
        tests get_page_text() with batch batch_uuml_thys_ver01.
        First creates a page object 'page' with _get_tip()
        then uses it as an argument to get_page_text()
        """
        batch_dir = os.path.join(settings.BATCH_STORAGE,
                                 'batch_uuml_thys_ver01')
        self.assertTrue(os.path.isdir(batch_dir))
        loader = BatchLoader(process_ocr=True)
        batch = loader.load_batch(batch_dir)
        self.assertEqual(batch.name, 'batch_uuml_thys_ver01')
        title, issue, page = _get_tip('sn83045396', '1911-09-17', 1, 1)
        text = get_page_text(page)
        self.assertIn("Uc nice at tlio slate fair track", text[0])
        self.assertIn("PAGES FIVE CENTS", text[0])
        self.assertIn('gBter ho had left the grounds that', text[0])

        # purge the batch and make sure it's gone from the db
        loader.purge_batch('batch_uuml_thys_ver01')
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(
            Title.objects.get(lccn='sn83045396').has_issues, False)
Example #10
0
    def test_load_batch(self):
        batch_dir = os.path.join(settings.BATCH_STORAGE,
                                 'batch_uuml_thys_ver01')
        self.assertTrue(os.path.isdir(batch_dir))
        loader = BatchLoader(process_ocr=False)
        batch = loader.load_batch(batch_dir)
        self.assertTrue(isinstance(batch, Batch))
        self.assertEqual(batch.name, 'batch_uuml_thys_ver01')
        self.assertEqual(len(batch.issues.all()), 2)

        title = Title.objects.get(lccn='sn83045396')
        self.assertTrue(title.has_issues)

        issue = batch.issues.all()[0]
        self.assertEqual(issue.volume, '83')
        self.assertEqual(issue.number, '156')
        self.assertEqual(issue.edition, 1)
        self.assertEqual(issue.title.lccn, 'sn83045396')
        self.assertEqual(date.strftime(issue.date_issued, '%Y-%m-%d'),
                         '1911-09-17')
        self.assertEqual(len(issue.pages.all()), 56)

        page = issue.pages.all()[0]
        self.assertEqual(page.sequence, 1)
        self.assertEqual(page.url, u'/lccn/sn83045396/1911-09-17/ed-1/seq-1/')

        note = page.notes.all()[1]
        self.assertEqual(note.type, "agencyResponsibleForReproduction")
        self.assertEqual(note.text, "uuml")

        self.assertEqual(page.sequence, 1)
        self.assertEqual(page.tiff_filename,
                         'sn83045396/print/1911091701/0001.tif')
        self.assertEqual(page.jp2_filename,
                         'sn83045396/print/1911091701/0001.jp2')
        self.assertEqual(page.jp2_length, 8736)
        self.assertEqual(page.jp2_width, 6544)
        self.assertEqual(page.ocr_filename,
                         'sn83045396/print/1911091701/0001.xml')
        self.assertEqual(page.pdf_filename,
                         'sn83045396/print/1911091701/0001.pdf')

        # extract ocr data just for this page
        loader.process_ocr(page, index=False)
        self.assertTrue(page.ocr != None)
        self.assertTrue(len(page.ocr.text) > 0)

        p = Title.objects.get(lccn='sn83045396').issues.all()[0].pages.all()[0]
        self.assertTrue(p.ocr != None)

        # check that the solr_doc looks legit
        solr_doc = page.solr_doc
        self.assertEqual(solr_doc['id'],
                         '/lccn/sn83045396/1911-09-17/ed-1/seq-1/')
        self.assertEqual(solr_doc['type'], 'page')
        self.assertEqual(solr_doc['sequence'], 1)
        self.assertEqual(solr_doc['lccn'], 'sn83045396')
        self.assertEqual(solr_doc['title'], 'The Salt Lake tribune.')
        self.assertEqual(solr_doc['date'], '19110917')
        self.assertEqual(solr_doc['batch'], 'batch_uuml_thys_ver01')
        self.assertEqual(solr_doc['subject'], [
            u'Salt Lake City (Utah)--Newspapers.',
            u'Utah--Salt Lake City.--fast--(OCoLC)fst01205314'
        ])
        self.assertEqual(solr_doc['place'],
                         [u'Utah--Salt Lake--Salt Lake City'])
        self.assertEqual(solr_doc['note'], [
            u'Archived issues are available in digital format as part of the Library of Congress Chronicling America online collection.',
            u'Continues the numbering of: Salt Lake daily tribune.',
            u'Other eds.: Salt Lake tribune (Salt Lake City, Utah : Idaho ed.), 1954-1973, and: Salt Lake tribune (Salt Lake City, Utah : Metropolitan ed.), 1960-1972, and: Salt Lake tribune (Salt Lake City, Utah : State ed.), 1954-1974.',
            u'Publisher varies.',
            u'Semiweekly ed.: Salt Lake semi-weekly tribune, 1894-1902.',
            u'Weekly ed.: Salt Lake weekly tribune (Salt Lake City, Utah : 1902), 1902-< >.'
        ])
        self.assertTrue(not solr_doc.has_key('essay'))

        f = os.path.join(os.path.dirname(chronam.core.__file__), 'test-data',
                         'uuml_thys_ocr.txt')
        self.assertEqual(solr_doc['ocr_eng'], file(f).read().decode('utf-8'))

        # purge the batch and make sure it's gone from the db
        loader.purge_batch('batch_uuml_thys_ver01')
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(
            Title.objects.get(lccn='sn83045396').has_issues, False)
    def test_load_batch(self):
        batch_dir = os.path.join(settings.BATCH_STORAGE, 'batch_uuml_thys_ver01')
        self.assertTrue(os.path.isdir(batch_dir))
        loader = BatchLoader(process_ocr=False)
        batch = loader.load_batch(batch_dir)
        self.assertTrue(isinstance(batch, Batch))
        self.assertEqual(batch.name, 'batch_uuml_thys_ver01')
        self.assertEqual(len(batch.issues.all()), 2)

        title = Title.objects.get(lccn='sn83045396')
        self.assertTrue(title.has_issues)

        issue = batch.issues.all()[0]
        self.assertEqual(issue.volume, '83')
        self.assertEqual(issue.number, '156')
        self.assertEqual(issue.edition, 1)
        self.assertEqual(issue.title.lccn, 'sn83045396')
        self.assertEqual(date.strftime(issue.date_issued, '%Y-%m-%d'),
                         '1911-09-17')
        self.assertEqual(len(issue.pages.all()), 56)

        page = issue.pages.all()[0]
        self.assertEqual(page.sequence, 1)
        self.assertEqual(page.url, u'/lccn/sn83045396/1911-09-17/ed-1/seq-1/')

        note = page.notes.all()[1]
        self.assertEqual(note.type, "agencyResponsibleForReproduction")
        self.assertEqual(note.text, "uuml")

        self.assertEqual(page.sequence, 1)
        self.assertEqual(page.tiff_filename, 'sn83045396/print/1911091701/0001.tif')
        self.assertEqual(page.jp2_filename, 'sn83045396/print/1911091701/0001.jp2')
        self.assertEqual(page.jp2_length, 8736)
        self.assertEqual(page.jp2_width, 6544)
        self.assertEqual(page.ocr_filename, 'sn83045396/print/1911091701/0001.xml')
        self.assertEqual(page.pdf_filename, 'sn83045396/print/1911091701/0001.pdf')

        # extract ocr data just for this page
        loader.process_ocr(page)
        self.assertTrue(page.ocr is not None)
        self.assertGreater(len(page.lang_text), 0)

        p = Title.objects.get(lccn='sn83045396').issues.all()[0].pages.all()[0]
        self.assertTrue(p.ocr is not None)

        # check that the solr_doc looks legit
        solr_doc = page.solr_doc
        self.assertEqual(solr_doc['id'], '/lccn/sn83045396/1911-09-17/ed-1/seq-1/')
        self.assertEqual(solr_doc['type'], 'page')
        self.assertEqual(solr_doc['sequence'], 1)
        self.assertEqual(solr_doc['lccn'], 'sn83045396')
        self.assertEqual(solr_doc['title'], 'The Salt Lake tribune.')
        self.assertEqual(solr_doc['date'], '19110917')
        self.assertEqual(solr_doc['batch'], 'batch_uuml_thys_ver01')
        self.assertEqual(solr_doc['subject'], [
            u'Salt Lake City (Utah)--Newspapers.',
            u'Utah--Salt Lake City.--fast--(OCoLC)fst01205314'])
        self.assertEqual(solr_doc['place'], [
            u'Utah--Salt Lake--Salt Lake City'])
        self.assertEqual(solr_doc['note'], [
            u'Archived issues are available in digital format as part of the Library of Congress Chronicling America online collection.',
            u'Continues the numbering of: Salt Lake daily tribune.',
            u'Other eds.: Salt Lake tribune (Salt Lake City, Utah : Idaho ed.), 1954-1973, and: Salt Lake tribune (Salt Lake City, Utah : Metropolitan ed.), 1960-1972, and: Salt Lake tribune (Salt Lake City, Utah : State ed.), 1954-1974.',
            u'Publisher varies.',
            u'Semiweekly ed.: Salt Lake semi-weekly tribune, 1894-1902.',
            u'Weekly ed.: Salt Lake weekly tribune (Salt Lake City, Utah : 1902), 1902-< >.'])
        self.assertTrue('essay' not in solr_doc)

        f = os.path.join(os.path.dirname(chronam.core.__file__), 'test-data',
                         'uuml_thys_ocr.txt')
        self.assertEqual(solr_doc['ocr_eng'], file(f).read().decode('utf-8'))

        # purge the batch and make sure it's gone from the db
        loader.purge_batch('batch_uuml_thys_ver01')
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(Title.objects.get(lccn='sn83045396').has_issues, False)
Example #12
0
    def test_load_batch(self):
        batch_dir = '/vol/ndnp/chronam/batches/dlc/batch_dlc_jamaica_ver01/'
        self.assertTrue(os.path.isdir(batch_dir))
        loader = BatchLoader(process_ocr=False)
        batch = loader.load_batch(batch_dir)
        self.assertTrue(isinstance(batch, Batch))
        self.assertEqual(batch.name, 'batch_dlc_jamaica_ver01')
        self.assertEqual(len(batch.issues.all()), 304)

        title = Title.objects.get(lccn='sn83030214')
        self.assertTrue(title.has_issues)

        issue = batch.issues.all()[0]
        self.assertEqual(issue.volume, '63')
        self.assertEqual(issue.number, '20620')
        self.assertEqual(issue.edition, 1)
        self.assertEqual(issue.title.lccn, 'sn83030214')
        self.assertEqual(date.strftime(issue.date_issued, '%Y-%m-%d'),
                         '1903-05-01')
        self.assertEqual(len(issue.pages.all()), 14)

        page = issue.pages.all()[0]
        self.assertEqual(page.sequence, 1)
        self.assertEqual(page.url, u'/lccn/sn83030214/1903-05-01/ed-1/seq-1/')

        note = page.notes.all()[1]
        self.assertEqual(note.type, "noteAboutReproduction")
        self.assertEqual(note.text, "Present")

        # extract ocr data just for this page
        loader.process_ocr(page, index=False)
        #self.assertEqual(page.number, 1)
        self.assertEqual(page.sequence, 1)
        self.assertEqual(page.tiff_filename,
                         'sn83030214/00175042143/1903050101/0002.tif')
        self.assertEqual(page.jp2_filename,
                         'sn83030214/00175042143/1903050101/0002.jp2')
        self.assertEqual(page.jp2_length, 8898)
        self.assertEqual(page.jp2_width, 6520)
        self.assertEqual(page.ocr_filename,
                         'sn83030214/00175042143/1903050101/0002.xml')
        self.assertEqual(page.pdf_filename,
                         'sn83030214/00175042143/1903050101/0002.pdf')
        self.assertTrue(page.ocr != None)
        self.assertTrue(len(page.ocr.text) > 0)

        p = Title.objects.get(lccn='sn83030214').issues.all()[0].pages.all()[0]
        self.assertTrue(p.ocr != None)

        # check that the solr_doc looks legit
        solr_doc = page.solr_doc
        self.assertEqual(solr_doc['id'],
                         '/lccn/sn83030214/1903-05-01/ed-1/seq-1/')
        self.assertEqual(solr_doc['type'], 'page')
        self.assertEqual(solr_doc['sequence'], 1)
        self.assertEqual(solr_doc['lccn'], 'sn83030214')
        self.assertEqual(solr_doc['title'], 'New-York tribune.')
        self.assertEqual(solr_doc['date'], '19030501')
        self.assertEqual(solr_doc['batch'], 'batch_dlc_jamaica_ver01')
        self.assertEqual(solr_doc['subject'], [
            u'New York (N.Y.)--Newspapers.',
            u'New York County (N.Y.)--Newspapers.'
        ])
        self.assertEqual(solr_doc['place'], [
            u'New York--Brooklyn--New York City',
            u'New York--Queens--New York City'
        ])
        self.assertEqual(solr_doc['note'],
                         [u"I'll take Manhattan", u'The Big Apple'])
        self.assertTrue(not solr_doc.has_key('essay'))

        f = os.path.join(os.path.dirname(chronam.core.__file__), 'test-data',
                         'ocr.txt')
        self.assertEqual(solr_doc['ocr_eng'], file(f).read().decode('utf-8'))

        # purge the batch and make sure it's gone from the db
        loader.purge_batch('batch_dlc_jamaica_ver01')
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(
            Title.objects.get(lccn='sn83030214').has_issues, False)
Example #13
0
    def test_load_batch(self):
        batch_dir = '/vol/ndnp/chronam/batches/dlc/batch_dlc_jamaica_ver01/'
        self.assertTrue(os.path.isdir(batch_dir))
        loader = BatchLoader(process_ocr=False)
        batch = loader.load_batch(batch_dir)
        self.assertTrue(isinstance(batch, Batch))
        self.assertEqual(batch.name, 'batch_dlc_jamaica_ver01')
        self.assertEqual(len(batch.issues.all()), 304)

        title = Title.objects.get(lccn = 'sn83030214')
        self.assertTrue(title.has_issues)

        issue = batch.issues.all()[0]
        self.assertEqual(issue.volume, '63')
        self.assertEqual(issue.number, '20620')
        self.assertEqual(issue.edition, 1)
        self.assertEqual(issue.title.lccn, 'sn83030214')
        self.assertEqual(date.strftime(issue.date_issued, '%Y-%m-%d'), 
            '1903-05-01')
        self.assertEqual(len(issue.pages.all()), 14)

        page = issue.pages.all()[0]
        self.assertEqual(page.sequence, 1)
        self.assertEqual(page.url, u'/lccn/sn83030214/1903-05-01/ed-1/seq-1/')

        note = page.notes.all()[1]
        self.assertEqual(note.type, "noteAboutReproduction")
        self.assertEqual(note.text, "Present")

        # extract ocr data just for this page
        loader.process_ocr(page, index=False)
        #self.assertEqual(page.number, 1)
        self.assertEqual(page.sequence, 1)
        self.assertEqual(page.tiff_filename, 'sn83030214/00175042143/1903050101/0002.tif')
        self.assertEqual(page.jp2_filename, 'sn83030214/00175042143/1903050101/0002.jp2')
        self.assertEqual(page.jp2_length, 8898)
        self.assertEqual(page.jp2_width, 6520)
        self.assertEqual(page.ocr_filename, 'sn83030214/00175042143/1903050101/0002.xml')
        self.assertEqual(page.pdf_filename, 'sn83030214/00175042143/1903050101/0002.pdf')
        self.assertTrue(page.ocr != None)
        self.assertTrue(len(page.ocr.text) > 0)

        p = Title.objects.get(lccn='sn83030214').issues.all()[0].pages.all()[0]
        self.assertTrue(p.ocr != None)

        # check that the solr_doc looks legit
        solr_doc = page.solr_doc
        self.assertEqual(solr_doc['id'], '/lccn/sn83030214/1903-05-01/ed-1/seq-1/')
        self.assertEqual(solr_doc['type'], 'page')
        self.assertEqual(solr_doc['sequence'], 1)
        self.assertEqual(solr_doc['lccn'], 'sn83030214')
        self.assertEqual(solr_doc['title'], 'New-York tribune.')
        self.assertEqual(solr_doc['date'], '19030501')
        self.assertEqual(solr_doc['batch'], 'batch_dlc_jamaica_ver01')
        self.assertEqual(solr_doc['subject'], [
            u'New York (N.Y.)--Newspapers.',
            u'New York County (N.Y.)--Newspapers.'])
        self.assertEqual(solr_doc['place'], [
            u'New York--Brooklyn--New York City', 
            u'New York--Queens--New York City'])
        self.assertEqual(solr_doc['note'], [
            u"I'll take Manhattan",
            u'The Big Apple'])
        self.assertTrue(not solr_doc.has_key('essay'))

        f = os.path.join(os.path.dirname(chronam.core.__file__), 'test-data', 
            'ocr.txt')
        self.assertEqual(solr_doc['ocr_eng'], file(f).read().decode('utf-8'))

        # purge the batch and make sure it's gone from the db
        loader.purge_batch('batch_dlc_jamaica_ver01')
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(Title.objects.get(lccn='sn83030214').has_issues, False)