def test_ocr_dump(self):
    """Load the test batch, build an OcrDump from it and check the archive.

    Verifies the dump's name, path and sha1, that the tarball holds two
    members per page, and that member mtimes are close to the time the
    dump was created. The batch is purged afterwards even when an
    assertion fails.
    """
    batch_name = "batch_oru_testbatch_ver01"
    dump_name = batch_name + ".tar.bz2"

    loader = BatchLoader()
    batch_dir = os.path.join(OcrDumpTests.batchDir, batch_name)
    batch = loader.load_batch(batch_dir)
    try:
        self.assertEqual(batch.page_count, 27)

        t0 = datetime.datetime.now()
        dump = OcrDump.new_from_batch(batch)
        self.assertEqual(dump.name, dump_name)
        self.assertEqual(dump.path,
                         os.path.join(OcrDumpTests.dumpDir, dump_name))

        # make sure the sha1 looks good; the archive is binary data, so it
        # must be read in binary mode, and the handle is closed promptly
        sha1 = hashlib.sha1()
        with open(dump.path, "rb") as fh:
            sha1.update(fh.read())
        self.assertEqual(dump.sha1, sha1.hexdigest())

        # make sure there are the right number of things in the dump
        with tarfile.open(dump.path, "r:bz2") as t:
            members = t.getmembers()
        # ocr xml and txt for each page
        self.assertEqual(len(members), 27 * 2)
        self.assertEqual(members[0].size, 19)

        # mtime on files in the archive should be just after we
        # created the OcrDump object from the batch
        t1 = datetime.datetime.fromtimestamp(members[0].mtime)
        self.assertTrue(t1 - t0 < datetime.timedelta(seconds=2))
    finally:
        # Make sure the batch is gone - mysql gets purged between tests, but
        # solr does not. This can't be done in teardown since the mysql db
        # is purged :(  Running it in ``finally`` keeps a failed assertion
        # above from leaving the batch behind for later tests.
        loader = BatchLoader()
        loader.purge_batch(batch_name)
def dump_ocr(batch_name):
    """Create an OCR dump for the named batch unless one already exists.

    Looks the batch up by ``name``; when it already has an ``ocr_dump``
    the function logs that fact and returns without doing anything else.
    Otherwise a new dump is built with ``OcrDump.new_from_batch``.

    NOTE(review): ``Batch.objects.get`` presumably raises if no batch has
    this name; that case is not handled here -- confirm against callers.
    """
    batch = Batch.objects.get(name=batch_name)

    # A batch without a dump signals that by raising OcrDump.DoesNotExist
    # on attribute access, which is the expected path for new dumps.
    try:
        existing_dump = batch.ocr_dump
    except OcrDump.DoesNotExist:
        existing_dump = None

    if existing_dump:
        logger.info("ocr already generated for %s", batch)
        return

    logger.info("starting to dump ocr for %s", batch)
    new_dump = OcrDump.new_from_batch(batch)
    logger.info("created ocr dump %s for %s", new_dump, batch)