def process_coordinates(batch_dir):
    """Run word-coordinate processing for the batch at *batch_dir*.

    Failures are logged with a traceback rather than propagated.
    """
    try:
        batch_loader = BatchLoader()
        batch_loader.process_coordinates(batch_dir)
        logger.info("processed batch %s", batch_dir)
    except Exception:
        # `except Exception, e` is Python-2-only syntax and `e` was unused;
        # pass batch_dir as a lazy %-arg instead of pre-formatting the message
        logger.exception("unable to process batch %s", batch_dir)
def process_coordinates(batch_dir):
    """Process coordinate data for one batch directory.

    Any error is logged (with traceback) instead of being raised.
    """
    try:
        loader = BatchLoader()
        loader.process_coordinates(batch_dir)
        logger.info("processed batch %s", batch_dir)
    except Exception:
        logger.exception("unable to process batch %s", batch_dir)
def handle(self, batch_name=None, *args, **options):
    """Purge the batch named *batch_name*; raise CommandError on failure."""
    if args:
        raise CommandError("Usage is purge_batch %s" % self.args)
    batch_loader = BatchLoader()
    try:
        LOGGER.info("purging batch %s", batch_name)
        batch_loader.purge_batch(batch_name)
    except BatchLoaderException:
        LOGGER.exception("Unable to purge batch %s", batch_name)
        raise CommandError("unable to purge batch. check the log for clues")
def handle(self, batch_location=None, *args, **options):
    """Purge the batch at *batch_location*; raise CommandError on failure."""
    if len(args) != 0:
        raise CommandError('Usage is purge_batch %s' % self.args)
    loader = BatchLoader()
    try:
        log.info("purging batch %s", batch_location)
        loader.purge_batch(batch_location)
    except BatchLoaderException as e:
        # `except BatchLoaderException, e` is Python-2-only syntax
        log.exception(e)
        raise CommandError("unable to purge batch. check the purge_batch log for clues")
def handle(self, batch_name, *args, **options):
    """Load the batch named *batch_name*, honoring the OCR/coordinate options."""
    if len(args) != 0:
        raise CommandError('Usage is load_batch %s' % self.args)
    loader = BatchLoader(process_ocr=options['process_ocr'],
                         process_coordinates=options['process_coordinates'])
    try:
        # the return value was previously bound to an unused local
        loader.load_batch(batch_name)
    except BatchLoaderException as e:
        # `except BatchLoaderException, e` is Python-2-only syntax
        LOGGER.exception(e)
        raise CommandError("unable to load batch. check the load_batch log for clues")
def handle(self, batch_location=None, *args, **options):
    """Purge the batch at *batch_location*; raise CommandError on failure."""
    if len(args) != 0:
        raise CommandError('Usage is purge_batch %s' % self.args)
    loader = BatchLoader()
    try:
        log.info("purging batch %s", batch_location)
        loader.purge_batch(batch_location)
    except BatchLoaderException as e:
        # `except BatchLoaderException, e` is Python-2-only syntax
        log.exception(e)
        raise CommandError(
            "unable to purge batch. check the purge_batch log for clues")
def load_batch(batch_dir, service_request=None, process_coordinates=True):
    """Load a batch and, when a service request is given, resolve it.

    On success the request is completed; on any failure the exception is
    logged and the request is marked failed with the error text.
    """
    try:
        loader = BatchLoader(process_coordinates=process_coordinates)
        loaded = loader.load_batch(batch_dir)
        logger.info("loaded batch %s", loaded)
        if service_request:
            logger.info("marking service request as complete")
            service_request.complete()
    except Exception as e:
        logger.exception("unable to load batch %s", batch_dir)
        if service_request:
            logger.info("marking service request as failed")
            service_request.fail(str(e))
def load_batch(batch_dir, service_request=None, process_coordinates=True):
    """Load a batch and, when a service request is given, resolve it.

    On success the request is completed; on any failure the exception is
    logged and the request is marked failed with the error text.
    """
    try:
        batch_loader = BatchLoader(process_coordinates=process_coordinates)
        batch = batch_loader.load_batch(batch_dir)
        logger.info("loaded batch %s", batch)
        if service_request:
            logger.info("marking service request as complete")
            service_request.complete()
    except Exception as e:
        # `except Exception, e` is Python-2-only syntax; use lazy %-args
        # instead of pre-formatting the log message
        logger.exception("unable to load batch %s", batch_dir)
        if service_request:
            logger.info("marking service request as failed")
            service_request.fail(str(e))
def handle(self, batch_path, *args, **options):
    """Load a batch from a local filesystem path after validating it."""
    if args:
        raise CommandError("Usage is load_batch %s" % self.args)
    if not os.path.isdir(batch_path):
        raise CommandError("Path %s does not exist" % batch_path)
    batch_path = os.path.realpath(batch_path)
    batch_loader = BatchLoader(
        process_ocr=options["process_ocr"],
        process_coordinates=options["process_coordinates"],
    )
    try:
        batch_loader.load_batch(batch_path)
    except Exception:
        LOGGER.exception("Unable to load batch from %s", batch_path)
def handle(self, batch_location=None, *args, **options):
    """Purge a batch and optionally optimize solr and the MySQL OCR table."""
    if len(args) != 0:
        raise CommandError('Usage is purge_batch %s' % self.args)
    loader = BatchLoader()
    try:
        log.info("purging batch %s", batch_location)
        loader.purge_batch(batch_location)
        if options['optimize']:
            log.info("optimizing solr")
            solr = SolrConnection(settings.SOLR)
            solr.optimize()
            log.info("optimizing MySQL OCR table")
            cursor = connection.cursor()
            cursor.execute("OPTIMIZE TABLE core_ocr")
            log.info("finished optimizing")
    except BatchLoaderException as e:
        # `except BatchLoaderException, e` is Python-2-only syntax
        log.exception(e)
        raise CommandError("unable to purge batch. check the purge_batch log for clues")
def test_getting_text_from_solr_slovenia(self):
    """Exercise get_page_text() against batch_iune_oriole_ver01.

    Loads the batch, builds a page object with _get_tip(), checks the
    extracted text, then purges the batch and verifies it is gone.
    """
    storage_dir = os.path.join(settings.BATCH_STORAGE, 'batch_iune_oriole_ver01')
    self.assertTrue(os.path.isdir(storage_dir))
    batch_loader = BatchLoader(process_ocr=True)
    loaded_batch = batch_loader.load_batch(storage_dir)
    self.assertEqual(loaded_batch.name, 'batch_iune_oriole_ver01')
    title, issue, page = _get_tip('sn83045377', '1906-03-01', 1, 1)
    page_text = get_page_text(page)
    self.assertIn("Od Mizo in dale", page_text[0])
    self.assertIn("To je preecj inoettii tobak! Marsi", page_text[0])
    # purge the batch and make sure it's gone from the db
    batch_loader.purge_batch('batch_iune_oriole_ver01')
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(Title.objects.get(lccn='sn83045377').has_issues, False)
def handle(self, batch_location=None, *args, **options):
    """Purge a batch; optionally optimize solr and the MySQL OCR table."""
    if args:
        raise CommandError('Usage is purge_batch %s' % self.args)
    batch_loader = BatchLoader()
    try:
        LOGGER.info("purging batch %s", batch_location)
        batch_loader.purge_batch(batch_location)
        if options['optimize']:
            LOGGER.info("optimizing solr")
            SolrConnection(settings.SOLR).optimize()
            LOGGER.info("optimizing MySQL OCR table")
            connection.cursor().execute("OPTIMIZE TABLE core_ocr")
            LOGGER.info("finished optimizing")
    except BatchLoaderException as e:
        LOGGER.exception(e)
        raise CommandError(
            "unable to purge batch. check the log for clues")
def test_getting_text_from_solr_utah(self):
    """Exercise get_page_text() against batch_uuml_thys_ver01.

    Loads the batch, builds a page object with _get_tip(), checks the
    extracted text, then purges the batch and verifies it is gone.
    """
    storage_dir = os.path.join(settings.BATCH_STORAGE, 'batch_uuml_thys_ver01')
    self.assertTrue(os.path.isdir(storage_dir))
    batch_loader = BatchLoader(process_ocr=True)
    loaded_batch = batch_loader.load_batch(storage_dir)
    self.assertEqual(loaded_batch.name, 'batch_uuml_thys_ver01')
    title, issue, page = _get_tip('sn83045396', '1911-09-17', 1, 1)
    page_text = get_page_text(page)
    self.assertIn("Uc nice at tlio slate fair track", page_text[0])
    self.assertIn("PAGES FIVE CENTS", page_text[0])
    self.assertIn('gBter ho had left the grounds that', page_text[0])
    # purge the batch and make sure it's gone from the db
    batch_loader.purge_batch('batch_uuml_thys_ver01')
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(Title.objects.get(lccn='sn83045396').has_issues, False)
def test_getting_text_from_solr_slovenia(self):
    """Exercise get_page_text() against batch_iune_oriole_ver01.

    Loads the batch, builds a page object with _get_tip(), checks the
    extracted text, then purges the batch and verifies it is gone.
    """
    source_dir = os.path.join(settings.BATCH_STORAGE, 'batch_iune_oriole_ver01')
    self.assertTrue(os.path.isdir(source_dir))
    batch_loader = BatchLoader(process_ocr=True)
    loaded = batch_loader.load_batch(source_dir)
    self.assertEqual(loaded.name, 'batch_iune_oriole_ver01')
    title, issue, page = _get_tip('sn83045377', '1906-03-01', 1, 1)
    extracted = get_page_text(page)
    self.assertIn("Od Mizo in dale", extracted[0])
    self.assertIn("To je preecj inoettii tobak! Marsi", extracted[0])
    # purge the batch and make sure it's gone from the db
    batch_loader.purge_batch('batch_iune_oriole_ver01')
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(
        Title.objects.get(lccn='sn83045377').has_issues, False)
def test_getting_text_from_solr_utah(self):
    """Exercise get_page_text() against batch_uuml_thys_ver01.

    Loads the batch, builds a page object with _get_tip(), checks the
    extracted text, then purges the batch and verifies it is gone.
    """
    source_dir = os.path.join(settings.BATCH_STORAGE, 'batch_uuml_thys_ver01')
    self.assertTrue(os.path.isdir(source_dir))
    batch_loader = BatchLoader(process_ocr=True)
    loaded = batch_loader.load_batch(source_dir)
    self.assertEqual(loaded.name, 'batch_uuml_thys_ver01')
    title, issue, page = _get_tip('sn83045396', '1911-09-17', 1, 1)
    extracted = get_page_text(page)
    self.assertIn("Uc nice at tlio slate fair track", extracted[0])
    self.assertIn("PAGES FIVE CENTS", extracted[0])
    self.assertIn('gBter ho had left the grounds that', extracted[0])
    # purge the batch and make sure it's gone from the db
    batch_loader.purge_batch('batch_uuml_thys_ver01')
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(
        Title.objects.get(lccn='sn83045396').has_issues, False)
def test_load_batch(self):
    """Load batch_dlc_jamaica_ver01 and verify issues, pages, OCR and solr_doc."""
    batch_dir = '/vol/ndnp/chronam/batches/dlc/batch_dlc_jamaica_ver01/'
    self.assertTrue(os.path.isdir(batch_dir))
    loader = BatchLoader(process_ocr=False)
    batch = loader.load_batch(batch_dir)
    self.assertTrue(isinstance(batch, Batch))
    self.assertEqual(batch.name, 'batch_dlc_jamaica_ver01')
    self.assertEqual(len(batch.issues.all()), 304)
    title = Title.objects.get(lccn='sn83030214')
    self.assertTrue(title.has_issues)
    issue = batch.issues.all()[0]
    self.assertEqual(issue.volume, '63')
    self.assertEqual(issue.number, '20620')
    self.assertEqual(issue.edition, 1)
    self.assertEqual(issue.title.lccn, 'sn83030214')
    self.assertEqual(date.strftime(issue.date_issued, '%Y-%m-%d'), '1903-05-01')
    self.assertEqual(len(issue.pages.all()), 14)
    page = issue.pages.all()[0]
    self.assertEqual(page.sequence, 1)
    self.assertEqual(page.url, u'/lccn/sn83030214/1903-05-01/ed-1/seq-1/')
    note = page.notes.all()[1]
    self.assertEqual(note.type, "noteAboutReproduction")
    self.assertEqual(note.text, "Present")
    # extract ocr data just for this page
    loader.process_ocr(page, index=False)
    self.assertEqual(page.sequence, 1)
    self.assertEqual(page.tiff_filename, 'sn83030214/00175042143/1903050101/0002.tif')
    self.assertEqual(page.jp2_filename, 'sn83030214/00175042143/1903050101/0002.jp2')
    self.assertEqual(page.jp2_length, 8898)
    self.assertEqual(page.jp2_width, 6520)
    self.assertEqual(page.ocr_filename, 'sn83030214/00175042143/1903050101/0002.xml')
    self.assertEqual(page.pdf_filename, 'sn83030214/00175042143/1903050101/0002.pdf')
    # identity comparison is the idiomatic None check
    self.assertTrue(page.ocr is not None)
    self.assertTrue(len(page.ocr.text) > 0)
    p = Title.objects.get(lccn='sn83030214').issues.all()[0].pages.all()[0]
    self.assertTrue(p.ocr is not None)
    # check that the solr_doc looks legit
    solr_doc = page.solr_doc
    self.assertEqual(solr_doc['id'], '/lccn/sn83030214/1903-05-01/ed-1/seq-1/')
    self.assertEqual(solr_doc['type'], 'page')
    self.assertEqual(solr_doc['sequence'], 1)
    self.assertEqual(solr_doc['lccn'], 'sn83030214')
    self.assertEqual(solr_doc['title'], 'New-York tribune.')
    self.assertEqual(solr_doc['date'], '19030501')
    self.assertEqual(solr_doc['batch'], 'batch_dlc_jamaica_ver01')
    self.assertEqual(solr_doc['subject'], [
        u'New York (N.Y.)--Newspapers.',
        u'New York County (N.Y.)--Newspapers.'])
    self.assertEqual(solr_doc['place'], [
        u'New York--Brooklyn--New York City',
        u'New York--Queens--New York City'])
    self.assertEqual(solr_doc['note'], [
        u"I'll take Manhattan", u'The Big Apple'])
    # dict.has_key was removed in python 3; `in` works on both
    self.assertTrue('essay' not in solr_doc)
    ocr_path = os.path.join(os.path.dirname(chronam.core.__file__),
                            'test-data', 'ocr.txt')
    # open() instead of the py2-only file() builtin; close the handle promptly
    with open(ocr_path) as ocr_file:
        expected_ocr = ocr_file.read().decode('utf-8')
    self.assertEqual(solr_doc['ocr_eng'], expected_ocr)
    # purge the batch and make sure it's gone from the db
    loader.purge_batch('batch_dlc_jamaica_ver01')
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(Title.objects.get(lccn='sn83030214').has_issues, False)
def test_load_batch(self):
    """Load batch_uuml_thys_ver01 and verify issues, pages, OCR and solr_doc."""
    batch_dir = os.path.join(settings.BATCH_STORAGE, 'batch_uuml_thys_ver01')
    self.assertTrue(os.path.isdir(batch_dir))
    loader = BatchLoader(process_ocr=False)
    batch = loader.load_batch(batch_dir)
    self.assertTrue(isinstance(batch, Batch))
    self.assertEqual(batch.name, 'batch_uuml_thys_ver01')
    self.assertEqual(len(batch.issues.all()), 2)
    title = Title.objects.get(lccn='sn83045396')
    self.assertTrue(title.has_issues)
    issue = batch.issues.all()[0]
    self.assertEqual(issue.volume, '83')
    self.assertEqual(issue.number, '156')
    self.assertEqual(issue.edition, 1)
    self.assertEqual(issue.title.lccn, 'sn83045396')
    self.assertEqual(date.strftime(issue.date_issued, '%Y-%m-%d'), '1911-09-17')
    self.assertEqual(len(issue.pages.all()), 56)
    page = issue.pages.all()[0]
    self.assertEqual(page.sequence, 1)
    self.assertEqual(page.url, u'/lccn/sn83045396/1911-09-17/ed-1/seq-1/')
    note = page.notes.all()[1]
    self.assertEqual(note.type, "agencyResponsibleForReproduction")
    self.assertEqual(note.text, "uuml")
    self.assertEqual(page.sequence, 1)
    self.assertEqual(page.tiff_filename, 'sn83045396/print/1911091701/0001.tif')
    self.assertEqual(page.jp2_filename, 'sn83045396/print/1911091701/0001.jp2')
    self.assertEqual(page.jp2_length, 8736)
    self.assertEqual(page.jp2_width, 6544)
    self.assertEqual(page.ocr_filename, 'sn83045396/print/1911091701/0001.xml')
    self.assertEqual(page.pdf_filename, 'sn83045396/print/1911091701/0001.pdf')
    # extract ocr data just for this page
    loader.process_ocr(page, index=False)
    # identity comparison is the idiomatic None check
    self.assertTrue(page.ocr is not None)
    self.assertTrue(len(page.ocr.text) > 0)
    p = Title.objects.get(lccn='sn83045396').issues.all()[0].pages.all()[0]
    self.assertTrue(p.ocr is not None)
    # check that the solr_doc looks legit
    solr_doc = page.solr_doc
    self.assertEqual(solr_doc['id'], '/lccn/sn83045396/1911-09-17/ed-1/seq-1/')
    self.assertEqual(solr_doc['type'], 'page')
    self.assertEqual(solr_doc['sequence'], 1)
    self.assertEqual(solr_doc['lccn'], 'sn83045396')
    self.assertEqual(solr_doc['title'], 'The Salt Lake tribune.')
    self.assertEqual(solr_doc['date'], '19110917')
    self.assertEqual(solr_doc['batch'], 'batch_uuml_thys_ver01')
    self.assertEqual(solr_doc['subject'], [
        u'Salt Lake City (Utah)--Newspapers.',
        u'Utah--Salt Lake City.--fast--(OCoLC)fst01205314'
    ])
    self.assertEqual(solr_doc['place'], [u'Utah--Salt Lake--Salt Lake City'])
    self.assertEqual(solr_doc['note'], [
        u'Archived issues are available in digital format as part of the Library of Congress Chronicling America online collection.',
        u'Continues the numbering of: Salt Lake daily tribune.',
        u'Other eds.: Salt Lake tribune (Salt Lake City, Utah : Idaho ed.), 1954-1973, and: Salt Lake tribune (Salt Lake City, Utah : Metropolitan ed.), 1960-1972, and: Salt Lake tribune (Salt Lake City, Utah : State ed.), 1954-1974.',
        u'Publisher varies.',
        u'Semiweekly ed.: Salt Lake semi-weekly tribune, 1894-1902.',
        u'Weekly ed.: Salt Lake weekly tribune (Salt Lake City, Utah : 1902), 1902-< >.'
    ])
    # dict.has_key was removed in python 3; `in` works on both
    self.assertTrue('essay' not in solr_doc)
    ocr_path = os.path.join(os.path.dirname(chronam.core.__file__),
                            'test-data', 'uuml_thys_ocr.txt')
    # open() instead of the py2-only file() builtin; close the handle promptly
    with open(ocr_path) as ocr_file:
        expected_ocr = ocr_file.read().decode('utf-8')
    self.assertEqual(solr_doc['ocr_eng'], expected_ocr)
    # purge the batch and make sure it's gone from the db
    loader.purge_batch('batch_uuml_thys_ver01')
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(
        Title.objects.get(lccn='sn83045396').has_issues, False)
def test_load_batch(self):
    """Load batch_dlc_jamaica_ver01 and verify issues, pages, OCR and solr_doc."""
    batch_dir = '/vol/ndnp/chronam/batches/dlc/batch_dlc_jamaica_ver01/'
    self.assertTrue(os.path.isdir(batch_dir))
    loader = BatchLoader(process_ocr=False)
    batch = loader.load_batch(batch_dir)
    self.assertTrue(isinstance(batch, Batch))
    self.assertEqual(batch.name, 'batch_dlc_jamaica_ver01')
    self.assertEqual(len(batch.issues.all()), 304)
    title = Title.objects.get(lccn='sn83030214')
    self.assertTrue(title.has_issues)
    issue = batch.issues.all()[0]
    self.assertEqual(issue.volume, '63')
    self.assertEqual(issue.number, '20620')
    self.assertEqual(issue.edition, 1)
    self.assertEqual(issue.title.lccn, 'sn83030214')
    self.assertEqual(date.strftime(issue.date_issued, '%Y-%m-%d'), '1903-05-01')
    self.assertEqual(len(issue.pages.all()), 14)
    page = issue.pages.all()[0]
    self.assertEqual(page.sequence, 1)
    self.assertEqual(page.url, u'/lccn/sn83030214/1903-05-01/ed-1/seq-1/')
    note = page.notes.all()[1]
    self.assertEqual(note.type, "noteAboutReproduction")
    self.assertEqual(note.text, "Present")
    # extract ocr data just for this page
    loader.process_ocr(page, index=False)
    self.assertEqual(page.sequence, 1)
    self.assertEqual(page.tiff_filename, 'sn83030214/00175042143/1903050101/0002.tif')
    self.assertEqual(page.jp2_filename, 'sn83030214/00175042143/1903050101/0002.jp2')
    self.assertEqual(page.jp2_length, 8898)
    self.assertEqual(page.jp2_width, 6520)
    self.assertEqual(page.ocr_filename, 'sn83030214/00175042143/1903050101/0002.xml')
    self.assertEqual(page.pdf_filename, 'sn83030214/00175042143/1903050101/0002.pdf')
    # identity comparison is the idiomatic None check
    self.assertTrue(page.ocr is not None)
    self.assertTrue(len(page.ocr.text) > 0)
    p = Title.objects.get(lccn='sn83030214').issues.all()[0].pages.all()[0]
    self.assertTrue(p.ocr is not None)
    # check that the solr_doc looks legit
    solr_doc = page.solr_doc
    self.assertEqual(solr_doc['id'], '/lccn/sn83030214/1903-05-01/ed-1/seq-1/')
    self.assertEqual(solr_doc['type'], 'page')
    self.assertEqual(solr_doc['sequence'], 1)
    self.assertEqual(solr_doc['lccn'], 'sn83030214')
    self.assertEqual(solr_doc['title'], 'New-York tribune.')
    self.assertEqual(solr_doc['date'], '19030501')
    self.assertEqual(solr_doc['batch'], 'batch_dlc_jamaica_ver01')
    self.assertEqual(solr_doc['subject'], [
        u'New York (N.Y.)--Newspapers.',
        u'New York County (N.Y.)--Newspapers.'
    ])
    self.assertEqual(solr_doc['place'], [
        u'New York--Brooklyn--New York City',
        u'New York--Queens--New York City'
    ])
    self.assertEqual(solr_doc['note'], [u"I'll take Manhattan", u'The Big Apple'])
    # dict.has_key was removed in python 3; `in` works on both
    self.assertTrue('essay' not in solr_doc)
    ocr_path = os.path.join(os.path.dirname(chronam.core.__file__),
                            'test-data', 'ocr.txt')
    # open() instead of the py2-only file() builtin; close the handle promptly
    with open(ocr_path) as ocr_file:
        expected_ocr = ocr_file.read().decode('utf-8')
    self.assertEqual(solr_doc['ocr_eng'], expected_ocr)
    # purge the batch and make sure it's gone from the db
    loader.purge_batch('batch_dlc_jamaica_ver01')
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(
        Title.objects.get(lccn='sn83030214').has_issues, False)
def handle(self, *args, **options):
    """Copy, purge and load DLG batches, reporting progress to Slack.

    Steps: rsync new/replacement batches from the FTP drop, load any new
    title MARC records, purge superseded batch versions, then load the
    replacement and new batches.
    """

    def get_immediate_subdirectories(a_dir):
        # first-level directories only, files are ignored
        return [
            name for name in os.listdir(a_dir)
            if os.path.isdir(os.path.join(a_dir, name))
        ]

    def slack(message):
        sc.api_call("chat.postMessage", channel="#ghnp", text=message)

    start = datetime.now()
    sc = SlackClient(settings.SLACK_KEY)
    loader = BatchLoader()
    new_batches_location = '/opt/chronam/data/chronamftp/new_batches/'
    replacement_batches_location = '/opt/chronam/data/chronamftp/replacement_batches/'
    nonlccn_location = '/opt/chronam/data/nonlccn/'
    # GET LIST OF BATCHES TO LOAD
    new_batches = get_immediate_subdirectories(new_batches_location)
    replacement_batches = get_immediate_subdirectories(
        replacement_batches_location)
    # CHECK new_batches FOR finalMARC FOLDERS
    # iterate over a copy: removing from the list being iterated skips items
    new_title_folders = []
    for folder in list(new_batches):
        if 'MARC' in folder:
            new_title_folders.append(folder)
            new_batches.remove(folder)
    # ISSUE STARTING NOTIFICATIONS
    slack(
        'Starting DLG Batch Load Process! Found `%s` new batches and `%s` replacement batches available to load.'
        % (len(new_batches), len(replacement_batches)))
    # RUN KEVIN'S RSYNC COMMANDS, WAIT
    slack('RSync of batches is starting')
    start_time = time.time()
    slack('Copying new batches')
    # shell=True so the '*' glob expands; the previous one-element list form
    # treated the entire command line as a (nonexistent) program name
    subprocess.call(
        'rsync -rav --progress /opt/chronam/data/chronamftp/new_batches/* /opt/chronam/data/dlg_batches/drop/',
        shell=True)
    slack('Copying replacement batches')
    subprocess.call(
        'rsync -rav --progress /opt/chronam/data/chronamftp/replacement_batches/* /opt/chronam/data/dlg_batches/drop/',
        shell=True)
    duration = time.time() - start_time
    slack('RSync of new and replacement batches completed in %s seconds' % duration)
    # LOAD NEW TITLES IF PRESENT
    if new_title_folders:
        slack('Also found `%s` title MARC files to process.' % len(new_title_folders))
        for nt in new_title_folders:
            for nt_f in os.listdir(os.path.join(new_batches_location, nt)):
                if nt_f.endswith('.xml'):
                    marc_file = os.path.join(nonlccn_location, nt_f)
                    copyfile(os.path.join(new_batches_location, nt, nt_f), marc_file)
                    title_load_results = title_loader.load(marc_file)
                    if title_load_results[1]:
                        slack('New title created from `%s`.' % nt_f)
                    if title_load_results[2]:
                        slack('Title updated from `%s`.' % nt_f)
                    if title_load_results[3]:
                        slack('Error on title load from `%s`' % nt_f)
        index_titles(start)
        slack('Finished loading titles.')
    # PURGE REPLACEMENT BATCHES
    if replacement_batches:
        slack('Purging batches destined for replacement.')
        for r_b in replacement_batches:
            # a verNN replacement supersedes the previous version
            batch_to_purge = r_b.replace('ver02', 'ver01')\
                .replace('ver03', 'ver02')\
                .replace('ver04', 'ver03')\
                .replace('ver05', 'ver04')\
                .replace('ver06', 'ver05')\
                .replace('ver07', 'ver06')\
                .replace('ver08', 'ver07')
            slack('Purging `%s`.' % batch_to_purge)
            loader.purge_batch(batch_to_purge)
        start_time = time.time()
        solr = SolrConnection(settings.SOLR)
        solr.optimize()
        # parenthesize the subtraction: '%s' % x - y formats first and then
        # tries to subtract a float from the resulting string (TypeError)
        slack('Index optimize complete in `%s` seconds.'
              % (time.time() - start_time))
    # LOAD ALL BATCHES
    # start with replacement batches
    final_loader = batch_loader.BatchLoader(process_ocr=True,
                                            process_coordinates=True)
    if replacement_batches:
        replace_start = time.time()
        for replacement in replacement_batches:
            final_loader.load_batch('drop/%s' % replacement, strict=False)
            slack('Loaded replacement batch `%s`.' % replacement)
        slack('All replacement batches loaded in `%s` seconds.'
              % (time.time() - replace_start))
    # load new batches
    if new_batches:
        new_start = time.time()
        for new in new_batches:
            final_loader.load_batch('drop/%s' % new, strict=False)
            slack('Loaded new batch `%s`.' % new)
        slack('All new batches loaded in `%s` seconds.'
              % (time.time() - new_start))
    slack('Batch loading job complete!')
def test_load_batch(self):
    """Load batch_uuml_thys_ver01 and verify issues, pages, OCR and solr_doc."""
    batch_dir = os.path.join(settings.BATCH_STORAGE, 'batch_uuml_thys_ver01')
    self.assertTrue(os.path.isdir(batch_dir))
    loader = BatchLoader(process_ocr=False)
    batch = loader.load_batch(batch_dir)
    self.assertTrue(isinstance(batch, Batch))
    self.assertEqual(batch.name, 'batch_uuml_thys_ver01')
    self.assertEqual(len(batch.issues.all()), 2)
    title = Title.objects.get(lccn='sn83045396')
    self.assertTrue(title.has_issues)
    issue = batch.issues.all()[0]
    self.assertEqual(issue.volume, '83')
    self.assertEqual(issue.number, '156')
    self.assertEqual(issue.edition, 1)
    self.assertEqual(issue.title.lccn, 'sn83045396')
    self.assertEqual(date.strftime(issue.date_issued, '%Y-%m-%d'), '1911-09-17')
    self.assertEqual(len(issue.pages.all()), 56)
    page = issue.pages.all()[0]
    self.assertEqual(page.sequence, 1)
    self.assertEqual(page.url, u'/lccn/sn83045396/1911-09-17/ed-1/seq-1/')
    note = page.notes.all()[1]
    self.assertEqual(note.type, "agencyResponsibleForReproduction")
    self.assertEqual(note.text, "uuml")
    self.assertEqual(page.sequence, 1)
    self.assertEqual(page.tiff_filename, 'sn83045396/print/1911091701/0001.tif')
    self.assertEqual(page.jp2_filename, 'sn83045396/print/1911091701/0001.jp2')
    self.assertEqual(page.jp2_length, 8736)
    self.assertEqual(page.jp2_width, 6544)
    self.assertEqual(page.ocr_filename, 'sn83045396/print/1911091701/0001.xml')
    self.assertEqual(page.pdf_filename, 'sn83045396/print/1911091701/0001.pdf')
    # extract ocr data just for this page
    loader.process_ocr(page)
    self.assertTrue(page.ocr is not None)
    self.assertGreater(len(page.lang_text), 0)
    p = Title.objects.get(lccn='sn83045396').issues.all()[0].pages.all()[0]
    self.assertTrue(p.ocr is not None)
    # check that the solr_doc looks legit
    solr_doc = page.solr_doc
    self.assertEqual(solr_doc['id'], '/lccn/sn83045396/1911-09-17/ed-1/seq-1/')
    self.assertEqual(solr_doc['type'], 'page')
    self.assertEqual(solr_doc['sequence'], 1)
    self.assertEqual(solr_doc['lccn'], 'sn83045396')
    self.assertEqual(solr_doc['title'], 'The Salt Lake tribune.')
    self.assertEqual(solr_doc['date'], '19110917')
    self.assertEqual(solr_doc['batch'], 'batch_uuml_thys_ver01')
    self.assertEqual(solr_doc['subject'], [
        u'Salt Lake City (Utah)--Newspapers.',
        u'Utah--Salt Lake City.--fast--(OCoLC)fst01205314'])
    self.assertEqual(solr_doc['place'], [
        u'Utah--Salt Lake--Salt Lake City'])
    self.assertEqual(solr_doc['note'], [
        u'Archived issues are available in digital format as part of the Library of Congress Chronicling America online collection.',
        u'Continues the numbering of: Salt Lake daily tribune.',
        u'Other eds.: Salt Lake tribune (Salt Lake City, Utah : Idaho ed.), 1954-1973, and: Salt Lake tribune (Salt Lake City, Utah : Metropolitan ed.), 1960-1972, and: Salt Lake tribune (Salt Lake City, Utah : State ed.), 1954-1974.',
        u'Publisher varies.',
        u'Semiweekly ed.: Salt Lake semi-weekly tribune, 1894-1902.',
        u'Weekly ed.: Salt Lake weekly tribune (Salt Lake City, Utah : 1902), 1902-< >.'])
    self.assertTrue('essay' not in solr_doc)
    ocr_path = os.path.join(os.path.dirname(chronam.core.__file__),
                            'test-data', 'uuml_thys_ocr.txt')
    # open() instead of the py2-only file() builtin; close the handle promptly
    with open(ocr_path) as ocr_file:
        expected_ocr = ocr_file.read().decode('utf-8')
    self.assertEqual(solr_doc['ocr_eng'], expected_ocr)
    # purge the batch and make sure it's gone from the db
    loader.purge_batch('batch_uuml_thys_ver01')
    self.assertEqual(Batch.objects.all().count(), 0)
    self.assertEqual(Title.objects.get(lccn='sn83045396').has_issues, False)