def handle(self, *args, **options):
    """Create an OCR dump file for every batch that does not already have one.

    Scans all Batch rows whose ``ocr_dump`` relation is null, builds a dump
    for each, and logs the result.
    """
    # Make sure the destination directory exists before writing any dumps.
    if not os.path.isdir(settings.OCR_DUMP_STORAGE):
        os.makedirs(settings.OCR_DUMP_STORAGE)
    # ocr_dump__isnull=True restricts the work to batches still missing a dump.
    for batch in Batch.objects.filter(ocr_dump__isnull=True):
        dump = OcrDump.new_from_batch(batch)
        # Pass the argument lazily instead of pre-formatting with '%' so the
        # string is only built when INFO logging is actually enabled.
        logging.info("created ocr dump file: %s", dump)
def handle(self, *args, **options):
    """Dump OCR for each batch named on the command line.

    An existing dump is replaced only when --overwrite was given; otherwise
    the batch is skipped with a warning.
    """
    overwrite = options['overwrite']
    if not os.path.isdir(settings.OCR_DUMP_STORAGE):
        os.makedirs(settings.OCR_DUMP_STORAGE)
    for batch_name in args:
        batch = Batch.objects.get(name=batch_name)
        logging.info('Starting to dump OCR for batch %s', batch_name)
        has_dump = hasattr(batch, 'ocr_dump')
        if has_dump and not overwrite:
            # Leave the existing dump alone unless explicitly told otherwise.
            logging.warning(
                'Skipping batch %s because dump %s exists and --overwrite was not specified',
                batch_name,
                batch.ocr_dump.path,
            )
            continue
        if has_dump:
            logging.info(
                'Deleting existing dump file %s before recreating it',
                batch.ocr_dump.path)
            batch.ocr_dump.delete()
        dump = OcrDump.new_from_batch(batch)
        logging.info('Created OCR dump for batch %s: %s', batch_name, dump)
def handle(self, batch_name, *args, **options):
    """Build an OCR dump for the single batch identified by *batch_name*."""
    # Create the storage directory on first use.
    if not os.path.isdir(settings.OCR_DUMP_STORAGE):
        os.makedirs(settings.OCR_DUMP_STORAGE)
    target = Batch.objects.get(name=batch_name)
    LOGGER.info("starting to dump ocr for %s", target)
    result = OcrDump.new_from_batch(target)
    LOGGER.info("created ocr dump %s for %s", result, target)
def dump_ocr(batch):
    """Create an OCR dump for *batch* unless one already exists."""
    # EAFP: accessing the one-to-one relation raises DoesNotExist when
    # no dump has been created yet, which is the expected fresh state.
    try:
        existing = batch.ocr_dump
    except OcrDump.DoesNotExist:
        # as expected
        existing = None
    if existing:
        logger.info("ocr already generated for %s", batch)
        return
    logger.info("starting to dump ocr for %s", batch)
    dump = OcrDump.new_from_batch(batch)
    logger.info("created ocr dump %s for %s", dump, batch)
def test_new_dump(self):
    """OcrDump.new_from_batch builds a compressed, checksummed dump that is
    removed along with its file when the source Batch is deleted."""
    batch = Batch.objects.get(name="batch_uuml_thys_ver01")
    self.assertEqual(batch.page_count, 56)

    # Total the uncompressed on-disk size of the batch for the
    # compression check below.
    batch_size = 0
    for dirpath, dirnames, filenames in os.walk(batch.path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            batch_size += os.path.getsize(fp)

    t0 = datetime.datetime.now()
    dump = OcrDump.new_from_batch(batch)
    self.assertEqual(dump.batch.name, "batch_uuml_thys_ver01")
    self.assertEqual(dump.name, "batch_uuml_thys_ver01.tar.bz2")
    self.assertEqual(
        dump.path,
        os.path.join(dumps_dir, "batch_uuml_thys_ver01.tar.bz2"))

    # make sure it was actually compressed
    self.assertGreater(batch_size, dump.size)

    # make sure the sha1 looks good; the file must be opened in binary
    # mode — sha1.update() requires bytes, and text-mode decoding would
    # fail or corrupt the bz2 stream.  The with-block also closes the
    # handle, which the old code leaked.
    sha1 = hashlib.sha1()
    with open(dump.path, "rb") as fh:
        while True:
            buff = fh.read(2 ** 16)
            if not buff:
                break
            sha1.update(buff)
    self.assertEqual(dump.sha1, sha1.hexdigest())

    # make sure there are the right number of things in the dump;
    # close the tarfile afterwards instead of leaking the handle.
    with tarfile.open(dump.path, "r:bz2") as t:
        members = t.getmembers()
    self.assertGreater(len(members), 1)

    # mtime on files in the archive should be just after we
    # created the OcrDump object from the batch
    t1 = datetime.datetime.fromtimestamp(members[0].mtime)
    self.assertTrue(t1 - t0 < datetime.timedelta(seconds=2))

    # when we delete the Batch, the OcrDump should be deleted
    # and so should the dump file on the filesystem
    path = dump.path
    batch.delete()
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(OcrDump.objects.all().count(), 0)
    self.assertTrue(not os.path.isfile(path))
def test_new_dump(self):
    """OcrDump.new_from_batch builds a compressed, checksummed dump that is
    removed along with its file when the source Batch is deleted."""
    batch = Batch.objects.get(name="batch_uuml_thys_ver01")
    self.assertEqual(batch.page_count, 56)

    # Total the uncompressed on-disk size of the batch for the
    # compression check below.
    batch_size = 0
    for dirpath, dirnames, filenames in os.walk(batch.path):
        for f in filenames:
            fp = os.path.join(dirpath, f)
            batch_size += os.path.getsize(fp)

    t0 = datetime.datetime.now()
    dump = OcrDump.new_from_batch(batch)
    self.assertEqual(dump.batch.name, "batch_uuml_thys_ver01")
    self.assertEqual(dump.name, "batch_uuml_thys_ver01.tar.bz2")
    self.assertEqual(dump.path,
                     os.path.join(dumps_dir, "batch_uuml_thys_ver01.tar.bz2"))

    # make sure it was actually compressed
    self.assertGreater(batch_size, dump.size)

    # make sure the sha1 looks good; the file must be opened in binary
    # mode — sha1.update() requires bytes, and text-mode decoding would
    # fail or corrupt the bz2 stream.  The with-block also closes the
    # handle, which the old code leaked.
    sha1 = hashlib.sha1()
    with open(dump.path, "rb") as fh:
        while True:
            buff = fh.read(2 ** 16)
            if not buff:
                break
            sha1.update(buff)
    self.assertEqual(dump.sha1, sha1.hexdigest())

    # make sure there are the right number of things in the dump;
    # close the tarfile afterwards instead of leaking the handle.
    with tarfile.open(dump.path, "r:bz2") as t:
        members = t.getmembers()
    self.assertGreater(len(members), 1)

    # mtime on files in the archive should be just after we
    # created the OcrDump object from the batch
    t1 = datetime.datetime.fromtimestamp(members[0].mtime)
    self.assertTrue(t1 - t0 < datetime.timedelta(seconds=2))

    # when we delete the Batch, the OcrDump should be deleted
    # and so should the dump file on the filesystem
    path = dump.path
    batch.delete()
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(OcrDump.objects.all().count(), 0)
    self.assertTrue(not os.path.isfile(path))
def test_new_dump(self):
    """OcrDump.new_from_batch builds a valid dump for the jamaica batch and
    cleans up both the DB row and the file when the Batch is deleted."""
    batch = Batch.objects.get(name="batch_dlc_jamaica_ver01")
    self.assertEqual(batch.page_count, 14)
    t0 = datetime.datetime.now()
    dump = OcrDump.new_from_batch(batch)
    self.assertEqual(dump.batch.name, "batch_dlc_jamaica_ver01")
    self.assertEqual(dump.name, "batch_dlc_jamaica_ver01.tar.bz2")
    self.assertEqual(
        dump.path,
        os.path.join(dumps_dir, "batch_dlc_jamaica_ver01.tar.bz2"))

    # size can actually vary based on the compression of the different dates
    # that are in the tarfile
    self.assertTrue(dump.size > 2000000)
    self.assertTrue(dump.size < 2871684)

    # make sure the sha1 looks good; the file must be opened in binary
    # mode — sha1.update() requires bytes, and text-mode decoding would
    # fail or corrupt the bz2 stream.  The with-block also closes the
    # handle, which the old code leaked.
    sha1 = hashlib.sha1()
    with open(dump.path, "rb") as fh:
        while True:
            buff = fh.read(2 ** 16)
            if not buff:
                break
            sha1.update(buff)
    self.assertEqual(dump.sha1, sha1.hexdigest())

    # make sure there are the right number of things in the dump;
    # close the tarfile afterwards instead of leaking the handle.
    with tarfile.open(dump.path, "r:bz2") as t:
        members = t.getmembers()
    self.assertEqual(len(members), 28)  # ocr xml and txt for each page
    self.assertEqual(members[0].size, 29610)

    # mtime on files in the archive should be just after we
    # created the OcrDump object from the batch
    t1 = datetime.datetime.fromtimestamp(members[0].mtime)
    self.assertTrue(t1 - t0 < datetime.timedelta(seconds=2))

    # when we delete the Batch, the OcrDump should be deleted
    # and so should the dump file on the filesystem
    path = dump.path
    batch.delete()
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(OcrDump.objects.all().count(), 0)
    self.assertTrue(not os.path.isfile(path))
def test_new_dump(self):
    """OcrDump.new_from_batch names the dump with a sequential part number
    and cleans up both the DB row and the file when the Batch is deleted."""
    batch = Batch.objects.get(name="batch_dlc_jamaica_ver01")
    self.assertEqual(batch.page_count, 14)
    t0 = datetime.datetime.now()
    dump = OcrDump.new_from_batch(batch)
    self.assertEqual(dump.batch.name, "batch_dlc_jamaica_ver01")
    self.assertEqual(dump.name, "part-000001.tar.bz2")
    self.assertEqual(dump.path, os.path.join(dumps_dir, "part-000001.tar.bz2"))

    # size can actually vary based on the compression of the different dates
    # that are in the tarfile
    self.assertTrue(dump.size > 2000000)
    self.assertTrue(dump.size < 2871684)

    # make sure the sha1 looks good; the file must be opened in binary
    # mode — sha1.update() requires bytes, and text-mode decoding would
    # fail or corrupt the bz2 stream.  The with-block also closes the
    # handle, which the old code leaked.
    sha1 = hashlib.sha1()
    with open(dump.path, "rb") as fh:
        while True:
            buff = fh.read(2 ** 16)
            if not buff:
                break
            sha1.update(buff)
    self.assertEqual(dump.sha1, sha1.hexdigest())

    # make sure there are the right number of things in the dump;
    # close the tarfile afterwards instead of leaking the handle.
    with tarfile.open(dump.path, "r:bz2") as t:
        members = t.getmembers()
    self.assertEqual(len(members), 28)  # ocr xml and txt for each page
    self.assertEqual(members[0].size, 29610)

    # mtime on files in the archive should be just after we
    # created the OcrDump object from the batch
    t1 = datetime.datetime.fromtimestamp(members[0].mtime)
    self.assertTrue(t1 - t0 < datetime.timedelta(seconds=2))

    # when we delete the Batch, the OcrDump should be deleted
    # and so should the dump file on the filesystem
    path = dump.path
    batch.delete()
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(OcrDump.objects.all().count(), 0)
    self.assertTrue(not os.path.isfile(path))
def handle(self, *args, **options):
    """Regenerate OCR dumps for the batches named on the command line.

    A batch that already has a dump is skipped unless --overwrite was given,
    in which case the old dump is deleted first.
    """
    overwrite = options['overwrite']
    if not os.path.isdir(settings.OCR_DUMP_STORAGE):
        os.makedirs(settings.OCR_DUMP_STORAGE)
    for batch_name in args:
        batch = Batch.objects.get(name=batch_name)
        logging.info('Starting to dump OCR for batch %s', batch_name)
        if hasattr(batch, 'ocr_dump'):
            if not overwrite:
                # Existing dump and no --overwrite: leave it untouched.
                logging.warning(
                    'Skipping batch %s because dump %s exists and --overwrite was not specified',
                    batch_name,
                    batch.ocr_dump.path,
                )
                continue
            logging.info('Deleting existing dump file %s before recreating it',
                         batch.ocr_dump.path)
            batch.ocr_dump.delete()
        dump = OcrDump.new_from_batch(batch)
        logging.info('Created OCR dump for batch %s: %s', batch_name, dump)