Beispiel #1
0
    def test_ocr_dump(self):
        """Create an OcrDump from the test batch and check the resulting archive.

        Verifies the dump's name, path and sha1, the number and size of the
        archive members, and that the member mtime is close to the time the
        dump was created. The batch is purged at the end of the test.
        """
        loader = BatchLoader()
        batch_dir = os.path.join(OcrDumpTests.batchDir, "batch_oru_testbatch_ver01")
        batch = loader.load_batch(batch_dir)
        self.assertEqual(batch.page_count, 27)

        t0 = datetime.datetime.now()

        dump = OcrDump.new_from_batch(batch)
        self.assertEqual(dump.name, "batch_oru_testbatch_ver01.tar.bz2")
        self.assertEqual(dump.path, os.path.join(OcrDumpTests.dumpDir, "batch_oru_testbatch_ver01.tar.bz2"))

        # make sure the sha1 looks good; the archive is binary data, so it
        # must be read in binary mode, and the handle is closed when done
        sha1 = hashlib.sha1()
        with open(dump.path, "rb") as fh:
            sha1.update(fh.read())
        self.assertEqual(dump.sha1, sha1.hexdigest())

        # make sure there are the right number of things in the dump
        t = tarfile.open(dump.path, "r:bz2")
        try:
            members = t.getmembers()
        finally:
            # release the archive handle even if reading the members fails
            t.close()
        self.assertEqual(len(members), 27 * 2) # ocr xml and txt for each page
        self.assertEqual(members[0].size, 19)

        # mtime on files in the archive should be just after we
        # created the OcrDump object from the batch
        t1 = datetime.datetime.fromtimestamp(members[0].mtime)
        self.assertTrue(t1 - t0 < datetime.timedelta(seconds=2))

        # Make sure the batch is gone - mysql gets purged between tests, but
        # solr does not.  This can't be done in teardown since the mysql db
        # is purged :(
        loader = BatchLoader()
        loader.purge_batch('batch_oru_testbatch_ver01')
Beispiel #2
0
    def handle(self, batch_name, *args, **options):
        """Load the batch identified by ``batch_name`` using a BatchLoader.

        The loader is configured from ``options['process_ocr']`` and
        ``options['process_coordinates']``.

        Raises:
            CommandError: if extra positional arguments are supplied, or if
                the loader raises BatchLoaderException (which is logged first).
        """
        if args:
            raise CommandError('Usage is load_batch %s' % self.args)

        loader = BatchLoader(process_ocr=options['process_ocr'],
                             process_coordinates=options['process_coordinates'])
        try:
            loader.load_batch(batch_name)
        # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
        # comma form, which is a syntax error on Python 3
        except BatchLoaderException as e:
            LOGGER.exception(e)
            raise CommandError("unable to load batch. check the load_batch log for clues")
Beispiel #3
0
    def handle(self, batch_name, *args, **options):
        """Load the batch identified by ``batch_name`` using a BatchLoader.

        The loader is configured from ``options['process_ocr']`` and
        ``options['process_coordinates']``.

        Raises:
            CommandError: if extra positional arguments are supplied, or if
                the loader raises BatchLoaderException (which is logged first).
        """
        if args:
            raise CommandError('Usage is load_batch %s' % self.args)

        loader = BatchLoader(
            process_ocr=options['process_ocr'],
            process_coordinates=options['process_coordinates'])
        try:
            loader.load_batch(batch_name)
        # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
        # comma form, which is a syntax error on Python 3
        except BatchLoaderException as e:
            LOGGER.exception(e)
            raise CommandError(
                "unable to load batch. check the load_batch log for clues")
Beispiel #4
0
    def handle(self, batch_path, *args, **options):
        """Load the batch found at ``batch_path``.

        Rejects extra positional arguments and paths that do not exist, then
        hands the path to a BatchLoader built from the ``process_ocr`` and
        ``process_coordinates`` options. A BatchLoaderException is logged and
        re-raised as a CommandError.
        """
        if args:
            raise CommandError('Usage: load_batch %s' % self.args)

        path_missing = not os.path.exists(batch_path)
        if path_missing:
            message = 'Batch path does not exist: {}'.format(batch_path)
            raise CommandError(message)

        loader_settings = {
            'process_ocr': options['process_ocr'],
            'process_coordinates': options['process_coordinates'],
        }
        batch_loader = BatchLoader(**loader_settings)
        try:
            batch_loader.load_batch(batch_path)
        except BatchLoaderException as error:
            LOGGER.exception(error)
            raise CommandError("Batch load failed. See logs/load_batch_#.log")
Beispiel #5
0
    def handle(self, batch_name=None, *args, **options):
        """Purge the batch named ``batch_name`` and optionally optimize storage.

        When ``options['optimize']`` is truthy, the Solr index is optimized
        and ``OPTIMIZE TABLE core_ocr`` is executed on the database
        connection after the purge.

        Raises:
            CommandError: if extra positional arguments are supplied, or if
                a BatchLoaderException is raised (which is logged first).
        """
        if args:
            raise CommandError('Usage is purge_batch %s' % self.args)

        loader = BatchLoader()
        try:
            log.info("purging batch '%s'", batch_name)
            loader.purge_batch(batch_name)
            if options['optimize']:
                log.info("optimizing solr")
                solr = SolrConnection(settings.SOLR)
                solr.optimize()
                log.info("optimizing MySQL OCR table")
                cursor = connection.cursor()
                cursor.execute("OPTIMIZE TABLE core_ocr")
                log.info("finished optimizing")
        # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
        # comma form, which is a syntax error on Python 3
        except BatchLoaderException as e:
            log.exception(e)
            raise CommandError("unable to purge batch. check the purge_batch log for clues")
Beispiel #6
0
    def handle(self, batch_location=None, *args, **options):
        """Purge the batch given by ``batch_location`` and optionally optimize storage.

        When ``options['optimize']`` is truthy, the Solr index is optimized
        and ``OPTIMIZE TABLE core_ocr`` is executed on the database
        connection after the purge.

        Raises:
            CommandError: if extra positional arguments are supplied, or if
                a BatchLoaderException is raised (which is logged first).
        """
        if args:
            raise CommandError('Usage is purge_batch %s' % self.args)

        loader = BatchLoader()
        try:
            log.info("purging batch %s", batch_location)
            loader.purge_batch(batch_location)
            if options['optimize']:
                log.info("optimizing solr")
                solr = SolrConnection(settings.SOLR)
                solr.optimize()
                log.info("optimizing MySQL OCR table")
                cursor = connection.cursor()
                cursor.execute("OPTIMIZE TABLE core_ocr")
                log.info("finished optimizing")
        # "except X as e" is valid on Python 2.6+ and Python 3, unlike the
        # comma form, which is a syntax error on Python 3
        except BatchLoaderException as e:
            log.exception(e)
            raise CommandError("unable to purge batch. check the purge_batch log for clues")
Beispiel #7
0
    def test_ocr_dump(self):
        """Create an OcrDump from the test batch and check the resulting archive.

        Verifies the dump's name, path and sha1, the number and size of the
        archive members, and that the member mtime is close to the time the
        dump was created. The batch is purged at the end of the test.
        """
        loader = BatchLoader()
        batch_dir = os.path.join(OcrDumpTests.batchDir,
                                 "batch_oru_testbatch_ver01")
        batch = loader.load_batch(batch_dir)
        self.assertEqual(batch.page_count, 27)

        t0 = timezone.now()

        dump = OcrDump.new_from_batch(batch)
        self.assertEqual(dump.name, "batch_oru_testbatch_ver01.tar.bz2")
        self.assertEqual(
            dump.path,
            os.path.join(OcrDumpTests.dumpDir,
                         "batch_oru_testbatch_ver01.tar.bz2"))

        # make sure the sha1 looks good; the context manager closes the
        # handle instead of leaking it for the rest of the test run
        sha1 = hashlib.sha1()
        with open(dump.path, "rb") as fh:
            sha1.update(fh.read())
        self.assertEqual(dump.sha1, sha1.hexdigest())

        # make sure there are the right number of things in the dump
        with tarfile.open(dump.path, "r:bz2") as t:
            members = t.getmembers()
        self.assertEqual(len(members), 27 * 2)  # ocr xml and txt for each page
        self.assertEqual(members[0].size, 19)

        # mtime on files in the archive should be just after we
        # created the OcrDump object from the batch
        t1 = datetime.datetime.fromtimestamp(members[0].mtime)
        # t0 is timezone-aware, so t1 must be made aware before subtracting
        t1 = timezone.make_aware(t1)
        self.assertLess(t1 - t0, datetime.timedelta(seconds=2))

        # Make sure the batch is gone - mysql gets purged between tests, but
        # solr does not.  This can't be done in teardown since the mysql db
        # is purged :(
        loader = BatchLoader()
        loader.purge_batch('batch_oru_testbatch_ver01')
Beispiel #8
0
    def test_load_batch(self):
        """Load the mini test batch and verify the records and solr doc it creates.

        Extracts the test tarball, loads the batch without OCR processing,
        checks the batch, issue, page and note data, runs OCR for one page,
        inspects that page's solr document, and finally purges the batch.
        """
        # Extract mini-batch tarball to /tmp somewhere
        tarpath = os.path.join(os.path.dirname(core.__file__), 'test-data', 'testbatch.tgz')
        # the context manager closes the archive even if extraction fails
        with tarfile.open(tarpath) as tar:
            tar.extractall(path=BatchLoaderTest.batchDir)
        settings.BATCH_STORAGE = BatchLoaderTest.batchDir

        batch_dir = os.path.join(BatchLoaderTest.batchDir, "batch_oru_testbatch_ver01")

        loader = BatchLoader(process_ocr=False)
        batch = loader.load_batch(batch_dir)
        self.assertIsInstance(batch, Batch)
        self.assertEqual(batch.name, 'batch_oru_testbatch_ver01')
        self.assertEqual(len(batch.issues.all()), 4)

        title = Title.objects.get(lccn='sn83030214')
        self.assertTrue(title.has_issues)

        issue = batch.issues.all()[0]
        self.assertEqual(issue.volume, '1')
        self.assertEqual(issue.number, '1')
        self.assertEqual(issue.edition, 1)
        self.assertEqual(issue.title.lccn, 'sn83030214')
        self.assertEqual(issue.date_issued.strftime('%Y-%m-%d'), '1999-06-15')
        self.assertEqual(len(issue.pages.all()), 15)

        page = issue.pages.all()[0]
        self.assertEqual(page.sequence, 1)
        self.assertEqual(page.url, '/lccn/sn83030214/1999-06-15/ed-1/seq-1/')

        # Compare the notes keyed by type rather than by position, so the
        # check does not depend on the order the rows come back in
        notes = list(page.notes.all())
        self.assertEqual(len(notes), 2)
        notes_by_type = {note.type: note.text for note in notes}
        self.assertEqual(notes_by_type, {
            "noteAboutReproduction": "Present",
            "agencyResponsibleForReproduction": "oru"})

        # Validate page 1's metadata
        self.assertEqual(page.sequence, 1)
        self.assertEqual(page.jp2_filename, 'sn83030214/print/1999061501/0001.jp2')
        self.assertEqual(page.jp2_length, 411)
        self.assertEqual(page.jp2_width, 411)
        self.assertEqual(page.ocr_filename, 'sn83030214/print/1999061501/0001.xml')
        self.assertEqual(page.pdf_filename, 'sn83030214/print/1999061501/0001.pdf')

        # extract ocr data just for this page
        loader.process_ocr(page, index=False)
        self.assertIsNotNone(page.ocr)
        self.assertGreater(len(page.ocr.text), 0)

        # re-fetch the page through the title to check the ocr is reachable
        # from a freshly queried object as well
        p = Title.objects.get(lccn='sn83030214').issues.all()[0].pages.all()[0]
        self.assertIsNotNone(p.ocr)

        # check that the solr_doc looks legit
        solr_doc = page.solr_doc
        self.assertEqual(solr_doc['id'], '/lccn/sn83030214/1999-06-15/ed-1/seq-1/')
        self.assertEqual(solr_doc['type'], 'page')
        self.assertEqual(solr_doc['sequence'], 1)
        self.assertEqual(solr_doc['lccn'], 'sn83030214')
        self.assertEqual(solr_doc['title'], 'New-York tribune.')
        self.assertEqual(solr_doc['date'], '19990615')
        self.assertEqual(solr_doc['batch'], 'batch_oru_testbatch_ver01')
        self.assertEqual(solr_doc['subject'], [
            'New York (N.Y.)--Newspapers.',
            'New York County (N.Y.)--Newspapers.'])
        self.assertEqual(solr_doc['place'], [
            'New York--Brooklyn--New York City',
            'New York--Queens--New York City'])
        self.assertEqual(solr_doc['note'], [
            "I'll take Manhattan",
            'The Big Apple'])
        self.assertNotIn('essay', solr_doc)
        self.assertEqual(solr_doc['ocr_eng'], 'LCCNsn83030214Page1')

        # purge the batch and make sure it's gone from the db
        loader.purge_batch('batch_oru_testbatch_ver01')
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(Title.objects.get(lccn='sn83030214').has_issues, False)
Beispiel #9
0
    def test_load_batch(self):
        """Load the mini test batch and verify the records and solr doc it creates.

        Extracts the test tarball, loads the batch without OCR processing,
        checks the batch, issue, page and note data, runs OCR for one page,
        inspects that page's solr document, and finally purges the batch.
        """
        # Extract mini-batch tarball to /tmp somewhere
        tarpath = os.path.join(os.path.dirname(core.__file__), 'test-data', 'testbatch.tgz')
        tar = tarfile.open(tarpath)
        try:
            tar.extractall(path=BatchLoaderTest.batchDir)
        finally:
            # close the archive even if extraction fails
            tar.close()
        settings.BATCH_STORAGE = BatchLoaderTest.batchDir

        batch_dir = os.path.join(BatchLoaderTest.batchDir, "batch_oru_testbatch_ver01")

        loader = BatchLoader(process_ocr=False)
        batch = loader.load_batch(batch_dir)
        self.assertTrue(isinstance(batch, Batch))
        self.assertEqual(batch.name, 'batch_oru_testbatch_ver01')
        self.assertEqual(len(batch.issues.all()), 4)

        title = Title.objects.get(lccn='sn83030214')
        self.assertTrue(title.has_issues)

        issue = batch.issues.all()[0]
        self.assertEqual(issue.volume, '1')
        self.assertEqual(issue.number, '1')
        self.assertEqual(issue.edition, 1)
        self.assertEqual(issue.title.lccn, 'sn83030214')
        self.assertEqual(date.strftime(issue.date_issued, '%Y-%m-%d'), '1999-06-15')
        self.assertEqual(len(issue.pages.all()), 15)

        page = issue.pages.all()[0]
        self.assertEqual(page.sequence, 1)
        self.assertEqual(page.url, u'/lccn/sn83030214/1999-06-15/ed-1/seq-1/')

        # NOTE(review): this relies on the order in which notes are returned;
        # confirm the model defines a default ordering
        note = page.notes.all()[1]
        self.assertEqual(note.type, "noteAboutReproduction")
        self.assertEqual(note.text, "Present")

        # Validate page 1's metadata
        self.assertEqual(page.sequence, 1)
        self.assertEqual(page.jp2_filename, 'sn83030214/print/1999061501/0001.jp2')
        self.assertEqual(page.jp2_length, 411)
        self.assertEqual(page.jp2_width, 411)
        self.assertEqual(page.ocr_filename, 'sn83030214/print/1999061501/0001.xml')
        self.assertEqual(page.pdf_filename, 'sn83030214/print/1999061501/0001.pdf')

        # extract ocr data just for this page
        loader.process_ocr(page, index=False)
        self.assertTrue(page.ocr is not None)
        self.assertTrue(len(page.ocr.text) > 0)

        # re-fetch the page through the title to check the ocr is reachable
        # from a freshly queried object as well
        p = Title.objects.get(lccn='sn83030214').issues.all()[0].pages.all()[0]
        self.assertTrue(p.ocr is not None)

        # check that the solr_doc looks legit
        solr_doc = page.solr_doc
        self.assertEqual(solr_doc['id'], '/lccn/sn83030214/1999-06-15/ed-1/seq-1/')
        self.assertEqual(solr_doc['type'], 'page')
        self.assertEqual(solr_doc['sequence'], 1)
        self.assertEqual(solr_doc['lccn'], 'sn83030214')
        self.assertEqual(solr_doc['title'], 'New-York tribune.')
        self.assertEqual(solr_doc['date'], '19990615')
        self.assertEqual(solr_doc['batch'], 'batch_oru_testbatch_ver01')
        self.assertEqual(solr_doc['subject'], [
            u'New York (N.Y.)--Newspapers.',
            u'New York County (N.Y.)--Newspapers.'])
        self.assertEqual(solr_doc['place'], [
            u'New York--Brooklyn--New York City',
            u'New York--Queens--New York City'])
        self.assertEqual(solr_doc['note'], [
            u"I'll take Manhattan",
            u'The Big Apple'])
        # dict.has_key() was removed in Python 3; the "in" operator works on
        # both Python 2 and Python 3
        self.assertTrue('essay' not in solr_doc)
        self.assertEqual(solr_doc['ocr_eng'], 'LCCNsn83030214Page1')

        # purge the batch and make sure it's gone from the db
        loader.purge_batch('batch_oru_testbatch_ver01')
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(Title.objects.get(lccn='sn83030214').has_issues, False)