Example #1
    def handle(self, *args, **options):
        if not os.path.isdir(settings.OCR_DUMP_STORAGE):
            os.makedirs(settings.OCR_DUMP_STORAGE)

        for batch in Batch.objects.filter(ocr_dump__isnull=True):
            dump = OcrDump.new_from_batch(batch)
            logging.info("created ocr dump file: %s" % dump)
Example #2
    def handle(self, *args, **options):
        overwrite = options['overwrite']

        if not os.path.isdir(settings.OCR_DUMP_STORAGE):
            os.makedirs(settings.OCR_DUMP_STORAGE)

        for batch_name in args:
            batch = Batch.objects.get(name=batch_name)
            logging.info('Starting to dump OCR for batch %s', batch_name)

            if hasattr(batch, 'ocr_dump'):
                if overwrite:
                    logging.info(
                        'Deleting existing dump file %s before recreating it',
                        batch.ocr_dump.path)
                    batch.ocr_dump.delete()
                else:
                    logging.warning(
                        'Skipping batch %s because dump %s exists and --overwrite was not specified',
                        batch_name,
                        batch.ocr_dump.path,
                    )
                    continue

            dump = OcrDump.new_from_batch(batch)
            logging.info('Created OCR dump for batch %s: %s', batch_name, dump)
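
Example #2 reads options['overwrite'] and iterates over batch names passed through *args, so the command has to declare both; the snippet does not show that part. A minimal sketch of the declaration, assuming the argparse-based add_arguments hook (naming the positional argument "args" is the documented way to have Django pass the values straight through to handle()'s *args):

    def add_arguments(self, parser):
        # Hypothetical declaration, not taken from the source; option names
        # mirror what handle() above reads.
        parser.add_argument('args', metavar='batch_name', nargs='*')
        parser.add_argument(
            '--overwrite',
            action='store_true',
            default=False,
            help='delete and recreate an existing OCR dump for the batch',
        )
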
Example #3
    def handle(self, batch_name, *args, **options):
        if not os.path.isdir(settings.OCR_DUMP_STORAGE):
            os.makedirs(settings.OCR_DUMP_STORAGE)

        batch = Batch.objects.get(name=batch_name)
        LOGGER.info("starting to dump ocr for %s", batch)
        dump = OcrDump.new_from_batch(batch)
        LOGGER.info("created ocr dump %s for %s", dump, batch)
Example #4
def dump_ocr(batch):
    try:
        if batch.ocr_dump:
            logger.info("ocr already generated for %s", batch)
        return
    except OcrDump.DoesNotExist:
        # expected: no dump has been generated for this batch yet
        pass

    logger.info("starting to dump ocr for %s", batch)
    dump = OcrDump.new_from_batch(batch)
    logger.info("created ocr dump %s for %s", dump, batch)
Example #5
def dump_ocr(batch):
    try:
        if batch.ocr_dump:
            logger.info("ocr already generated for %s", batch)
        return
    except OcrDump.DoesNotExist:
        # expected: no dump has been generated for this batch yet
        pass

    logger.info("starting to dump ocr for %s", batch)
    dump = OcrDump.new_from_batch(batch)
    logger.info("created ocr dump %s for %s", dump, batch)
Example #6
    def test_new_dump(self):
        batch = Batch.objects.get(name="batch_uuml_thys_ver01")
        self.assertEqual(batch.page_count, 56)

        batch_size = 0
        for dirpath, dirnames, filenames in os.walk(batch.path):
            for f in filenames:
                fp = os.path.join(dirpath, f)
                batch_size += os.path.getsize(fp)

        t0 = datetime.datetime.now()
        dump = OcrDump.new_from_batch(batch)
        self.assertEqual(dump.batch.name, "batch_uuml_thys_ver01")
        self.assertEqual(dump.name, "batch_uuml_thys_ver01.tar.bz2")
        self.assertEqual(
            dump.path, os.path.join(dumps_dir,
                                    "batch_uuml_thys_ver01.tar.bz2"))
        # make sure it was actually compressed
        self.assertGreater(batch_size, dump.size)

        # make sure the sha1 looks good
        sha1 = hashlib.sha1()
        fh = open(dump.path, "rb")
        while True:
            buff = fh.read(2**16)
            if not buff:
                break
            sha1.update(buff)
        self.assertEqual(dump.sha1, sha1.hexdigest())

        # make sure there are the right number of things in the dump
        t = tarfile.open(dump.path, "r:bz2")
        members = t.getmembers()
        self.assertGreater(len(members), 1)

        # mtime on files in the archive should be just after we
        # created the OcrDump object from the batch
        t1 = datetime.datetime.fromtimestamp(members[0].mtime)
        self.assertTrue(t1 - t0 < datetime.timedelta(seconds=2))

        # when we delete the Batch, the OcrDump should be deleted
        # and so should the dump file on the filesystem
        path = dump.path
        batch.delete()
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(OcrDump.objects.all().count(), 0)
        self.assertTrue(not os.path.isfile(path))
Example #7
    def test_new_dump(self):
        batch = Batch.objects.get(name="batch_uuml_thys_ver01")
        self.assertEqual(batch.page_count, 56)

        batch_size = 0
        for dirpath, dirnames, filenames in os.walk(batch.path):
            for f in filenames:
                fp = os.path.join(dirpath, f)
                batch_size += os.path.getsize(fp)

        t0 = datetime.datetime.now()
        dump = OcrDump.new_from_batch(batch)
        self.assertEqual(dump.batch.name, "batch_uuml_thys_ver01")
        self.assertEqual(dump.name, "batch_uuml_thys_ver01.tar.bz2")
        self.assertEqual(dump.path, os.path.join(dumps_dir, "batch_uuml_thys_ver01.tar.bz2"))
        # make sure it was actually compressed
        self.assertGreater(batch_size, dump.size)

        # make sure the sha1 looks good
        sha1 = hashlib.sha1()
        fh = open(dump.path, "rb")
        while True:
            buff = fh.read(2**16)
            if not buff:
                break
            sha1.update(buff)
        self.assertEqual(dump.sha1, sha1.hexdigest())

        # make sure there are the right number of things in the dump
        t = tarfile.open(dump.path, "r:bz2")
        members = t.getmembers()
        self.assertGreater(len(members), 1)

        # mtime on files in the archive should be just after we
        # created the OcrDump object from the batch
        t1 = datetime.datetime.fromtimestamp(members[0].mtime)
        self.assertTrue(t1 - t0 < datetime.timedelta(seconds=2))

        # when we delete the Batch, the OcrDump should be deleted
        # and so should the dump file on the filesystem
        path = dump.path
        batch.delete()
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(OcrDump.objects.all().count(), 0)
        self.assertTrue(not os.path.isfile(path))
Example #8
    def test_new_dump(self):
        batch = Batch.objects.get(name="batch_dlc_jamaica_ver01")
        self.assertEqual(batch.page_count, 14)

        t0 = datetime.datetime.now()
        dump = OcrDump.new_from_batch(batch)
        self.assertEqual(dump.batch.name, "batch_dlc_jamaica_ver01")
        self.assertEqual(dump.name, "batch_dlc_jamaica_ver01.tar.bz2")
        self.assertEqual(
            dump.path,
            os.path.join(dumps_dir, "batch_dlc_jamaica_ver01.tar.bz2"))
        # size can actually vary based on the compression of the different dates
        # that are in the tarfile
        self.assertTrue(dump.size > 2000000)
        self.assertTrue(dump.size < 2871684)

        # make sure the sha1 looks good
        sha1 = hashlib.sha1()
        fh = open(dump.path, "rb")
        while True:
            buff = fh.read(2**16)
            if not buff:
                break
            sha1.update(buff)
        self.assertEqual(dump.sha1, sha1.hexdigest())

        # make sure there are the right number of things in the dump
        t = tarfile.open(dump.path, "r:bz2")
        members = t.getmembers()
        self.assertEqual(len(members), 28)  # ocr xml and txt for each page
        self.assertEqual(members[0].size, 29610)

        # mtime on files in the archive should be just after we
        # created the OcrDump object from the batch
        t1 = datetime.datetime.fromtimestamp(members[0].mtime)
        self.assertTrue(t1 - t0 < datetime.timedelta(seconds=2))

        # when we delete the Batch, the OcrDump should be deleted
        # and so should the dump file on the filesystem
        path = dump.path
        batch.delete()
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(OcrDump.objects.all().count(), 0)
        self.assertTrue(not os.path.isfile(path))
Example #9
    def test_new_dump(self):
        batch = Batch.objects.get(name="batch_dlc_jamaica_ver01")
        self.assertEqual(batch.page_count, 14)

        t0 = datetime.datetime.now()
        dump = OcrDump.new_from_batch(batch)
        self.assertEqual(dump.batch.name, "batch_dlc_jamaica_ver01")
        self.assertEqual(dump.name, "part-000001.tar.bz2")
        self.assertEqual(dump.path, os.path.join(dumps_dir, "part-000001.tar.bz2"))
        # size can actually vary based on the compression of the different dates
        # that are in the tarfile
        self.assertTrue(dump.size > 2000000)
        self.assertTrue(dump.size < 2871684)

        # make sure the sha1 looks good
        sha1 = hashlib.sha1()
        fh = open(dump.path, "rb")
        while True:
            buff = fh.read(2 ** 16)
            if not buff:
                break
            sha1.update(buff)
        self.assertEqual(dump.sha1, sha1.hexdigest())

        # make sure there are the right number of things in the dump
        t = tarfile.open(dump.path, "r:bz2")
        members = t.getmembers()
        self.assertEqual(len(members), 28)  # ocr xml and txt for each page
        self.assertEqual(members[0].size, 29610)

        # mtime on files in the archive should be just after we
        # created the OcrDump object from the batch
        t1 = datetime.datetime.fromtimestamp(members[0].mtime)
        self.assertTrue(t1 - t0 < datetime.timedelta(seconds=2))

        # when we delete the Batch, the OcrDump should be deleted
        # and so should the dump file on the filesystem
        path = dump.path
        batch.delete()
        self.assertEqual(Batch.objects.all().count(), 0)
        self.assertEqual(OcrDump.objects.all().count(), 0)
        self.assertTrue(not os.path.isfile(path))
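
Each test above repeats the same read-and-hash loop to check dump.sha1 against the file on disk. A small stdlib-only helper (not part of the source) that captures that verification, reading in binary mode:

import hashlib


def file_sha1(path, chunk_size=2 ** 16):
    # Hash a file in fixed-size chunks, mirroring the loop used in the tests.
    sha1 = hashlib.sha1()
    with open(path, "rb") as fh:
        for chunk in iter(lambda: fh.read(chunk_size), b""):
            sha1.update(chunk)
    return sha1.hexdigest()
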
Example #10
    def handle(self, *args, **options):
        overwrite = options['overwrite']

        if not os.path.isdir(settings.OCR_DUMP_STORAGE):
            os.makedirs(settings.OCR_DUMP_STORAGE)

        for batch_name in args:
            batch = Batch.objects.get(name=batch_name)
            logging.info('Starting to dump OCR for batch %s', batch_name)

            if hasattr(batch, 'ocr_dump'):
                if overwrite:
                    logging.info('Deleting existing dump file %s before recreating it', batch.ocr_dump.path)
                    batch.ocr_dump.delete()
                else:
                    logging.warning(
                        'Skipping batch %s because dump %s exists and --overwrite was not specified',
                        batch_name,
                        batch.ocr_dump.path,
                    )
                    continue

            dump = OcrDump.new_from_batch(batch)
            logging.info('Created OCR dump for batch %s: %s', batch_name, dump)
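
The command variants above can also be exercised from tests or scripts through Django's call_command API. A hedged example, assuming the command is registered under the name "dump_ocr" and reusing the batch name from Examples #8 and #9:

from django.core.management import call_command

# Hypothetical invocation; the command name "dump_ocr" is an assumption,
# while the batch-name argument and --overwrite flag mirror the handle()
# shown above.
call_command("dump_ocr", "batch_dlc_jamaica_ver01", overwrite=True)
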