def handle(self, **options):
    solr = SolrConnection(SOLR_URL)
    if options['user']:
        solr.delete_query('user:%s' % options['user'])
    else:
        solr.delete_query('id:[* TO *]')
    solr.commit()
def handle(self, **options):
    solr = SolrConnection(settings.SOLR)
    if options['batch']:
        solr.delete_query('batch: %s' % options['batch'])
    else:
        solr.delete_query('id:[* TO *]')
    solr.commit()
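A minimal sketch of what the two branches in the handlers above send to Solr (the batch name is borrowed from the load_batch docstring later in this collection and is only illustrative; a reachable Solr at settings.SOLR is assumed):

solr = SolrConnection(settings.SOLR)
solr.delete_query('batch: batch_curiv_ahwahnee_ver01')  # delete one batch's documents
solr.delete_query('id:[* TO *]')                        # open-ended range query: matches every document, clearing the index
solr.commit()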
def handle(self, **options):
    solr = SolrConnection(SOLR_URL)
    if options["user"]:
        solr.delete_query("user:%s" % options["user"])
    else:
        solr.delete_query("id:[* TO *]")
    solr.commit()
def test_index_pages(self):
    solr = SolrConnection(settings.SOLR)
    solr.delete_query('type:page')
    solr.commit()
    self.assertEqual(si.page_count(), 0)
    si.index_pages()
    self.assertEqual(si.page_count(), 2)
def index_titles(since=None):
    """index all the titles and holdings that are modeled in the database

    if you pass in a datetime object as the since parameter only title
    records that have been created since that time will be indexed.
    """
    cursor = connection.cursor()
    solr = SolrConnection(settings.SOLR)
    if since:
        cursor.execute("SELECT lccn FROM core_title WHERE created >= '%s'" % since)
    else:
        solr.delete_query('type:title')
        cursor.execute("SELECT lccn FROM core_title")
    count = 0
    while True:
        row = cursor.fetchone()
        if row == None:
            break
        title = models.Title.objects.get(lccn=row[0])
        index_title(title, solr)
        count += 1
        if count % 100 == 0:
            _log.info("indexed %s titles" % count)
            reset_queries()
            solr.commit()
    solr.commit()
def index_titles(since=None):
    """index all the titles and holdings that are modeled in the database

    if you pass in a datetime object as the since parameter only title
    records that have been created since that time will be indexed.
    """
    cursor = connection.cursor()
    solr = SolrConnection(settings.SOLR)
    if since:
        cursor.execute("SELECT lccn FROM core_title WHERE created >= '%s'" % since)
    else:
        solr.delete_query('type:title')
        cursor.execute("SELECT lccn FROM core_title")
    count = 0
    while True:
        row = cursor.fetchone()
        if row is None:
            break
        title = models.Title.objects.get(lccn=row[0])
        index_title(title, solr)
        count += 1
        if count % 100 == 0:
            LOGGER.info("indexed %s titles", count)
            reset_queries()
            solr.commit()
    solr.commit()
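A hedged usage sketch for the indexer above (the import path and Django settings wiring are assumptions): it shows how the since parameter switches between a full rebuild, which first deletes the existing type:title documents, and an incremental pass that leaves them in place.

from datetime import datetime, timedelta

index_titles()                                          # full rebuild: deletes type:title docs first
index_titles(since=datetime.now() - timedelta(days=1))  # incremental: only titles created in the last day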
def solr_reindex(self):
    """
    Reindex all entries. Used when switching to/from "private" status.
    """
    solr_conn = SolrConnection(settings.SOLR_URL)
    # Start by deleting 'em all
    solr_conn.delete_query('user:%s' % self.user.id)
    entries = Entry.objects.filter(user=self.user)
    docs = []
    # Arbitrary assignment of a constant, here.
    SLICE_SIZE = 50
    slices = [x for x in range(entries.count()) if x % SLICE_SIZE == 0]
    for s in slices:
        entry_slice = entries[s:s + SLICE_SIZE]
        for entry in entry_slice:
            docs.append(entry.solr_doc)
            if len(docs) == SLICE_SIZE:
                try:
                    solr_conn.add_many(docs)
                except:
                    # should log appropriately, huh
                    pass
                del(docs)
                docs = []
    # Don't miss the leftovers
    solr_conn.add_many(docs)
    solr_conn.commit()
    solr_conn.optimize()
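For reference, a minimal sketch of an equivalent way to compute the slice offsets above with a stepped range (same SLICE_SIZE assumption); both forms yield the same start indices 0, 50, 100, and so on.

SLICE_SIZE = 50
slices = range(0, entries.count(), SLICE_SIZE)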
def solr_delete(self):
    """ Remove from solr index """
    solr_conn = SolrConnection(settings.SOLR_URL, persistent=False)
    solr_conn.delete_query('id:%s' % self.id)
    solr_conn.commit()
def delete_title(title, solr=None):
    if not solr:
        solr = SolrConnection(settings.SOLR)
    if isinstance(title, models.Title):
        title_id = title.url
    else:
        title_id = title
    q = "+type:title +id:%s" % title_id
    solr.delete_query(q)
    LOGGER.info("deleted title %s from the index", title)
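A hedged usage sketch for the variant above that accepts an existing connection: titles_to_remove is a hypothetical iterable, and because this delete_title does not commit, the caller commits once after the loop.

solr = SolrConnection(settings.SOLR)
for t in titles_to_remove:      # hypothetical iterable of Title objects or bare id strings
    delete_title(t, solr=solr)
solr.commit()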
def finished(self, **kwargs):
    source_id = kwargs['source_id']
    track_id = kwargs['track_id']

    # Build up a Solr query
    filters = []
    filters.append('type:request')
    filters.append('channel_id:%s' % self.channel_id)
    filters.append('request_source_id:%s' % source_id)
    filters.append('request_track_id:%s' % track_id)

    # Make the request to Solr
    solr = SolrConnection(settings.SOLR_URL)
    solr.delete_query(' AND '.join(filters))
    solr.commit()
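To make the joined deletion query concrete, a comment-only sketch with hypothetical values:

# With channel_id=7, source_id='sp', track_id='abc123' (hypothetical values),
# ' AND '.join(filters) produces:
#   type:request AND channel_id:7 AND request_source_id:sp AND request_track_id:abc123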
def index_pages():
    """index all the pages that are modeled in the database
    """
    solr = SolrConnection(settings.SOLR)
    solr.delete_query('type:page')
    cursor = connection.cursor()
    cursor.execute("SELECT id FROM core_page")
    count = 0
    while True:
        row = cursor.fetchone()
        if row is None:
            break
        page = models.Page.objects.get(id=row[0])
        LOGGER.info("[%s] indexing page: %s", count, page.url)
        solr.add(**page.solr_doc)
        count += 1
        if count % 100 == 0:
            reset_queries()
    solr.commit()
class BatchLoader(object): """This class allows you to load a batch into the database. A loader object serves as a context for a particular batch loading job. """ def __init__(self, process_ocr=True, process_coordinates=True): """Create a BatchLoader. The process_ocr parameter is used (mainly in testing) when we don't want to spend time actually extracting ocr text and indexing. """ self.PROCESS_OCR = process_ocr self.solr = SolrConnection(settings.SOLR) self.PROCESS_COORDINATES = process_coordinates def _find_batch_file(self, batch): """ TODO: Who can we toss the requirement at to make this available in a canonical location? """ # look for batch_1.xml, BATCH_1.xml, etc for alias in [ "batch_1.xml", "BATCH_1.xml", "batchfile_1.xml", "batch_2.xml", "BATCH_2.xml", "batch.xml", ]: # TODO: might we want 'batch.xml' first? Leaving last for now to # minimize impact. url = urlparse.urljoin(batch.storage_url, alias) try: urllib2.urlopen(url) validated_batch_file = alias break except (urllib2.HTTPError, urllib2.URLError): continue else: raise BatchLoaderException( "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?" % batch.path) return validated_batch_file def _sanity_check_batch(self, batch): # if not os.path.exists(batch.path): # raise BatchLoaderException("batch does not exist at %s" % batch.path) # b = urllib2.urlopen(batch.url) batch.validated_batch_file = self._find_batch_file(batch) def load_batch(self, batch_path, strict=True): """Load a batch, and return a Batch instance for the batch that was loaded. loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01') """ LOGGER.info("loading batch at %s", batch_path) dirname, batch_name = os.path.split(batch_path.rstrip("/")) if dirname: batch_source = None link_name = os.path.join(settings.BATCH_STORAGE, batch_name) if batch_path != link_name and not os.path.islink(link_name): LOGGER.info("creating symlink %s -> %s", batch_path, link_name) os.symlink(batch_path, link_name) else: batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name) if not batch_source.endswith("/"): batch_source += "/" batch_name = _normalize_batch_name(batch_name) if not strict: try: batch = Batch.objects.get(name=batch_name) LOGGER.info("Batch already loaded: %s", batch_name) return batch except Batch.DoesNotExist as e: pass LOGGER.info("loading batch: %s", batch_name) event = LoadBatchEvent(batch_name=batch_name, message="starting load") event.save() batch = None try: # build a Batch object for the batch location batch = self._get_batch(batch_name, batch_source, create=True) self._sanity_check_batch(batch) # stash it away for processing later on self.current_batch = batch # parse the batch.xml and load up each issue mets file doc = etree.parse(batch.validated_batch_url) for e in doc.xpath("ndnp:reel", namespaces=ns): reel_number = e.attrib["reelNumber"].strip() try: reel = models.Reel.objects.get(number=reel_number, batch=batch) except models.Reel.DoesNotExist as e: reel = models.Reel(number=reel_number, batch=batch) reel.save() for e in doc.xpath("ndnp:issue", namespaces=ns): mets_url = urlparse.urljoin(batch.storage_url, e.text) try: issue, pages = self._load_issue(mets_url) except ValueError as e: LOGGER.exception("Unable to load issue from %s", mets_url) continue # commit new changes to the solr index, if we are indexing if self.PROCESS_OCR: LOGGER.info("Adding pages to solr index from issue %s", issue.title) for page in pages: LOGGER.debug("indexing ocr for: %s", page.url) self.solr.add(**page.solr_doc) page.indexed = 
True page.save() if self.PROCESS_OCR: LOGGER.info("Committing solr index") self.solr.commit() batch.save() msg = "processed %s pages" % batch.page_count LOGGER.info(msg) event = LoadBatchEvent(batch_name=batch_name, message=msg) event.save() except Exception as e: msg = "unable to load batch: %s" % e LOGGER.exception(msg) event = LoadBatchEvent(batch_name=batch_name, message=msg) event.save() try: self.purge_batch(batch_name) except Exception: LOGGER.exception( "Unable to purge batch %s after loading failed", batch_name) raise BatchLoaderException(msg) if settings.IS_PRODUCTION: batch.released = datetime.now() batch.save() cache.delete("newspaper_info") return batch def _get_batch(self, batch_name, batch_source=None, create=False): if create: batch = self._create_batch(batch_name, batch_source) else: batch = Batch.objects.get(name=batch_name) return batch def _create_batch(self, batch_name, batch_source): if Batch.objects.filter(name=batch_name).count() != 0: raise BatchLoaderException("batch %s already loaded" % batch_name) batch = Batch() batch.name = batch_name batch.source = batch_source try: parts = batch_name.split("_", 3) if len(parts) == 4: parts = parts[1:] awardee_org_code, name_part, version = parts batch.awardee = Awardee.objects.get(org_code=awardee_org_code) except Awardee.DoesNotExist: msg = "no awardee for org code: %s" % awardee_org_code LOGGER.error(msg) raise BatchLoaderException(msg) batch.save() return batch def _load_issue(self, mets_file): LOGGER.debug("parsing issue mets file: %s", mets_file) doc = etree.parse(mets_file) # get the mods for the issue div = doc.xpath('.//mets:div[@TYPE="np:issue"]', namespaces=ns)[0] dmdid = div.attrib["DMDID"] mods = dmd_mods(doc, dmdid) # set up a new Issue issue = Issue() issue.volume = mods.xpath( 'string(.//mods:detail[@type="volume"]/mods:number[1])', namespaces=ns).strip() issue.number = mods.xpath( 'string(.//mods:detail[@type="issue"]/mods:number[1])', namespaces=ns).strip() issue.edition = int( mods.xpath( 'string(.//mods:detail[@type="edition"]/mods:number[1])', namespaces=ns)) issue.edition_label = mods.xpath( 'string(.//mods:detail[@type="edition"]/mods:caption[1])', namespaces=ns).strip() # parse issue date date_issued = mods.xpath("string(.//mods:dateIssued)", namespaces=ns) issue.date_issued = datetime.strptime(date_issued, "%Y-%m-%d") # attach the Issue to the appropriate Title lccn = mods.xpath('string(.//mods:identifier[@type="lccn"])', namespaces=ns).strip() try: title = Title.objects.get(lccn=lccn) except title.DoesNotExist: url = "https://chroniclingamerica.loc.gov/lccn/%s/marc.xml" % lccn LOGGER.info("attempting to load MARC record from %s", url) management.call_command("load_titles", url) title = Title.objects.get(lccn=lccn) issue.title = title issue.batch = self.current_batch issue.save() LOGGER.debug("saved issue: %s", issue.url) notes = [] for mods_note in mods.xpath(".//mods:note", namespaces=ns): type = mods_note.xpath("string(./@type)") label = mods_note.xpath("string(./@displayLabel)") text = mods_note.xpath("string(.)") note = models.IssueNote(type=type, label=label, text=text) notes.append(note) issue.notes = notes issue.save() # attach pages: lots of logging because it's expensive pages = [] for page_div in div.xpath('.//mets:div[@TYPE="np:page"]', namespaces=ns): try: pages.append(self._load_page(doc, page_div, issue)) except BatchLoaderException: LOGGER.exception( "Failed to load page. 
doc: %s, page div: %s, issue: %s", doc, page_div, issue) return issue, pages def _load_page(self, doc, div, issue): dmdid = div.attrib["DMDID"] mods = dmd_mods(doc, dmdid) page = Page() seq_string = mods.xpath("string(.//mods:extent/mods:start)", namespaces=ns) try: page.sequence = int(seq_string) except ValueError: raise BatchLoaderException( "could not determine sequence number for page from '%s'" % seq_string) page.number = mods.xpath('string(.//mods:detail[@type="page number"])', namespaces=ns).strip() reel_number = mods.xpath( 'string(.//mods:identifier[@type="reel number"])', namespaces=ns).strip() try: reel = models.Reel.objects.get(number=reel_number, batch=self.current_batch) page.reel = reel except models.Reel.DoesNotExist: if reel_number: reel = models.Reel(number=reel_number, batch=self.current_batch, implicit=True) reel.save() page.reel = reel else: LOGGER.warning("unable to find reel number in page metadata") LOGGER.info("Assigned page sequence: %s", page.sequence) _section_dmdid = div.xpath( 'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)', namespaces=ns) if _section_dmdid: section_mods = dmd_mods(doc, _section_dmdid) section_label = section_mods.xpath( 'string(.//mods:detail[@type="section label"]/mods:number[1])', namespaces=ns).strip() if section_label: page.section_label = section_label page.issue = issue LOGGER.info("Saving page. issue date: %s, page sequence: %s", issue.date_issued, page.sequence) # TODO - consider the possibility of executing the file name # assignments (below) before this page.save(). page.save() notes = [] for mods_note in mods.xpath(".//mods:note", namespaces=ns): type = mods_note.xpath("string(./@type)") label = mods_note.xpath("string(./@displayLabel)") text = mods_note.xpath("string(.)").strip() note = models.PageNote(type=type, label=label, text=text) notes.append(note) page.notes = notes # there's a level indirection between the METS structmap and the # details about specific files in this package ... # so we have to first get the FILEID from the issue div in the # structmap and then use it to look up the file details in the # larger document. for fptr in div.xpath("./mets:fptr", namespaces=ns): file_id = fptr.attrib["FILEID"] file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id, namespaces=ns)[0] file_type = file_el.attrib["USE"] # get the filename relative to the storage location file_name = file_el.xpath("string(./mets:FLocat/@xlink:href)", namespaces=ns) file_name = urlparse.urljoin(doc.docinfo.URL, file_name) file_name = self.storage_relative_path(file_name) if file_type == "master": page.tiff_filename = file_name elif file_type == "service": page.jp2_filename = file_name try: # extract image dimensions from technical metadata for jp2 for admid in file_el.attrib["ADMID"].split(" "): length, width = get_dimensions(doc, admid) if length and width: page.jp2_width = width page.jp2_length = length break except KeyError: LOGGER.info( "Could not determine dimensions of jp2 for issue: %s page: %s... 
trying harder...", page.issue, page, ) if not page.jp2_width: raise BatchLoaderException( "No jp2 width for issue: %s page: %s" % (page.issue, page)) if not page.jp2_length: raise BatchLoaderException( "No jp2 length for issue: %s page: %s" % (page.issue, page)) elif file_type == "derivative": page.pdf_filename = file_name elif file_type == "ocr": page.ocr_filename = file_name if page.ocr_filename: # don't incurr overhead of extracting ocr text, word coordinates # and indexing unless the batch loader has been set up to do it if self.PROCESS_OCR: page = self.process_ocr(page) else: LOGGER.info("No ocr filename for issue: %s page: %s", page.issue, page) LOGGER.debug("saving page: %s", page.url) page.save() return page def process_ocr(self, page): LOGGER.debug("extracting ocr text and word coords for %s", page.url) url = urlparse.urljoin(self.current_batch.storage_url, page.ocr_filename) lang_text, coords = ocr_extractor(url) if self.PROCESS_COORDINATES: self._process_coordinates(page, coords) ocr = OCR() ocr.page = page ocr.save() lang_text_solr = {} for lang, text in lang_text.iteritems(): try: language = models.Language.objects.get( Q(code=lang) | Q(lingvoj__iendswith=lang)) except models.Language.DoesNotExist: LOGGER.warning( "Language %s does not exist in the database. Defaulting to English.", lang) # default to english as per requirement language = models.Language.objects.get(code="eng") ocr.language_texts.create(language=language) lang_text_solr[language.code] = text page.ocr = ocr page.lang_text = lang_text_solr page.save() return page def _process_coordinates(self, page, coords): LOGGER.debug("writing out word coords for %s", page.url) # We'll use a temporary file in case the coordinates dir is configured # to a network filesystem which has poor update performance # characteristics fd, path = tempfile.mkstemp(text="w", suffix=".coordinates", dir=settings.TEMP_STORAGE) f = open(path, "w") f.write(gzip_compress(json.dumps(coords))) f.close() os.close(fd) final_path = models.coordinates_path(page._url_parts()) try: shutil.move(path, final_path) except Exception: LOGGER.warning( 'Could not move coordinates to "%s". 
Waiting 5 seconds before trying again…', final_path) time.sleep(5) shutil.move(path, final_path) def process_coordinates(self, batch_path): LOGGER.info("process word coordinates for batch at %s", batch_path) dirname, batch_name = os.path.split(batch_path.rstrip("/")) if dirname: batch_source = None else: batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name) if not batch_source.endswith("/"): batch_source += "/" batch_name = _normalize_batch_name(batch_name) try: batch = self._get_batch(batch_name, batch_source, create=False) self.current_batch = batch for issue in batch.issues.all(): for page in issue.pages.all(): if not page.ocr_filename: LOGGER.warning( "Batch [%s] page [%s] has no OCR; skipping coordinates processing", batch_name, page, ) else: url = urlparse.urljoin(self.current_batch.storage_url, page.ocr_filename) LOGGER.debug("Extracting OCR from url %s", url) lang_text, coords = ocr_extractor(url) self._process_coordinates(page, coords) except Exception as e: msg = "unable to process coordinates for batch: %s" % e LOGGER.exception(msg) raise BatchLoaderException(msg) def storage_relative_path(self, path): """returns a relative path for a given file path within a batch, so that storage can be re-homed without having to rewrite paths in the db """ rel_path = path.replace(self.current_batch.storage_url, "") return rel_path @transaction.atomic def purge_batch(self, batch_name): batch_name = _normalize_batch_name(batch_name) try: batch = self._get_batch(batch_name) except Batch.DoesNotExist: LOGGER.info("Batch %s does not exist", batch_name) return event = LoadBatchEvent(batch_name=batch_name, message="starting purge") event.save() try: self._purge_batch(batch) event = LoadBatchEvent(batch_name=batch_name, message="purged") event.save() # clean up symlinks if exists link_name = os.path.join(settings.BATCH_STORAGE, batch_name) if os.path.islink(link_name): LOGGER.info("Removing symlink %s", link_name) os.remove(link_name) except Exception as e: msg = "purge failed: %s" % e LOGGER.exception(msg) event = LoadBatchEvent(batch_name=batch_name, message=msg) event.save() raise BatchLoaderException(msg) def _purge_batch(self, batch): batch_name = batch.name # just delete batch causes memory to bloat out # so we do it piece-meal for issue in batch.issues.prefetch_related("pages__issue", "pages__issue__title"): for page in issue.pages.all(): page.delete() # remove coordinates if os.path.exists(models.coordinates_path(page._url_parts())): os.remove(models.coordinates_path(page._url_parts())) issue.delete() batch.delete() if self.PROCESS_OCR: self.solr.delete_query('batch:"%s"' % batch_name) self.solr.commit()
def uninstall(field, data, path, length, mode):
    from solr import SolrConnection
    s = SolrConnection(SOLR_URL)
    s.delete_query('id:*')
    s.commit()
def delete_title(title):
    solr = SolrConnection(settings.SOLR)
    q = '+type:title +id:%s' % title.solr_doc['id']
    solr.delete_query(q)
    LOGGER.info("deleted title %s from the index", title)
def search(field, data, path, hlength, mode):
    from termcolor import colored
    from solr import SolrConnection

    #hlength = int(hlength)
    # search solr, get filePath, do a grep and show the line
    #print 'search'
    s = SolrConnection(SOLR_URL)
    if field == 'name':
        query = 'name:"' + data + '"'
        response = s.query(query)
    elif field == 'txt':
        query = 'txt:"' + data + '"'
        #response = s.query(query, hl=True, hl.q='txt:bandits', hl.fl='txt', hl.fragsize=50, hl.preserveMulti=True, hl.snippets=100)
        if hlength:
            response = s.query(query, fl='id,name', highlight=True, fields='txt',
                               hl_q=query, hl_fragsize=hlength, hl_snippets=1000,
                               hl_bs_type='SENTENCE')
        else:
            response = s.query(query, fl='id,name')
    else:
        query = 'name:"' + data + '" OR txt:"' + data + '"'
        #response = s.query(query, hl=True, hl.q='txt:bandits', hl.fl='txt', hl.fragsize=50, hl.preserveMulti=True, hl.snippets=100)
        if hlength:
            response = s.query(query, fl='id,name', highlight=True, fields='txt',
                               hl_q=query, hl_fragsize=hlength, hl_snippets=1000,
                               hl_bs_type='SENTENCE')
        else:
            response = s.query(query, fl='id,name')
    #print query
    #print response.__dict__
    #print response.highlighting
    if hlength and field != 'name':
        hlength = int(hlength)
        for id in response.highlighting:
            if os.path.isfile(id):
                if response.highlighting[id]:
                    for txt in response.highlighting[id]['txt']:
                        txt = txt.strip()
                        startpos = txt.index('<em>')
                        endpos = txt.rindex('</em>')
                        print (txt[:startpos] + colored(txt[startpos+4:endpos], 'red') + txt[endpos+5:]).replace('<em>', '').replace('</em>', '')
                else:
                    fdata = open(id, 'r').read().decode('raw_unicode_escape').replace('\n', ' ').replace('\t', ' ')
                    fdata = filter(lambda x: x in string.printable, fdata)
                    for m in re.finditer(data, fdata):
                        start = m.start() - hlength
                        if start < 0:
                            start = 0
                        end = m.end() + hlength
                        if end > len(fdata):
                            end = len(fdata)
                        print (fdata[start:m.start()] + colored(fdata[m.start():m.end()], 'red') + fdata[m.end():end]).replace('<em>', '').replace('</em>', '')
                if id.endswith(('.mp3')):
                    if mode == 'slow':
                        x = raw_input('press `y` to play, `n` to move forward \n')
                        if x == 'y':
                            subprocess.call(["afplay", id])
                else:
                    print '\t To open the file press cmd + double click '
                    print colored("file://" + id, 'blue')
                    print '\n \n'
                    if mode == 'slow':
                        raw_input('press any key to continue \n')
            else:
                s.delete_query('id:' + id)
    else:
        for hit in response.results:
            if hit['id']:
                if hit['id'].endswith(('.mp3')):
                    if mode == 'slow':
                        x = raw_input('press `y` to play, `n` to move forward \n')
                        if x == 'y':
                            subprocess.call(["afplay", hit['id']])
                else:
                    print '\t To open the file press cmd + double click '
                    print colored("file://" + hit['id'], 'blue')
                    print '\n \n'
                    if mode == 'slow':
                        raw_input('press any key to continue \n')
            else:
                s.delete_query('id:' + hit['id'])
def delete_title(title):
    solr = SolrConnection(settings.SOLR)
    q = '+type:title +id:%s' % title.solr_doc['id']
    r = solr.delete_query(q)
    _log.info("deleted title %s from the index" % title)
def tearDownClass(cls):
    solrconn = SolrConnection(settings.SOLR_SERVER)
    solrconn.delete_query('type:cantusdata_music_notation AND manuscript:' + MEI_FIXTURE_SIGLUM)
    solrconn.delete_query('type:cantusdata_folio AND manuscript_id:{0}'.format(MEI_FIXTURE_ID))
    solrconn.commit()
class BatchLoader(object): """This class allows you to load a batch into the database. A loader object serves as a context for a particular batch loading job. """ def __init__(self, process_ocr=True, process_coordinates=True): """Create a BatchLoader. The process_ocr parameter is used (mainly in testing) when we don't want to spend time actually extracting ocr text and indexing. """ self.PROCESS_OCR = process_ocr self.solr = SolrConnection(settings.SOLR) self.PROCESS_COORDINATES = process_coordinates def _find_batch_file(self, batch): """ TODO: Who can we toss the requirement at to make this available in a canonical location? """ # look for batch_1.xml, BATCH_1.xml, etc for alias in ["batch_1.xml", "BATCH_1.xml", "batchfile_1.xml", "batch_2.xml", "BATCH_2.xml", "batch.xml"]: # TODO: might we want 'batch.xml' first? Leaving last for now to # minimize impact. url = urlparse.urljoin(batch.storage_url, alias) try: urllib2.urlopen(url) validated_batch_file = alias break except (urllib2.HTTPError, urllib2.URLError): continue else: raise BatchLoaderException( "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?" % batch.path) return validated_batch_file def _sanity_check_batch(self, batch): # if not os.path.exists(batch.path): # raise BatchLoaderException("batch does not exist at %s" % batch.path) #b = urllib2.urlopen(batch.url) batch.validated_batch_file = self._find_batch_file(batch) def load_batch(self, batch_path, strict=True): """Load a batch, and return a Batch instance for the batch that was loaded. loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01') """ LOGGER.info("loading batch at %s", batch_path) dirname, batch_name = os.path.split(batch_path.rstrip("/")) if dirname: batch_source = None link_name = os.path.join(settings.BATCH_STORAGE, batch_name) if batch_path != link_name and not os.path.islink(link_name): LOGGER.info("creating symlink %s -> %s", batch_path, link_name) os.symlink(batch_path, link_name) else: batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name) if not batch_source.endswith("/"): batch_source += "/" batch_name = _normalize_batch_name(batch_name) if not strict: try: batch = Batch.objects.get(name=batch_name) LOGGER.info("Batch already loaded: %s", batch_name) return batch except Batch.DoesNotExist as e: pass LOGGER.info("loading batch: %s", batch_name) event = LoadBatchEvent(batch_name=batch_name, message="starting load") event.save() batch = None try: # build a Batch object for the batch location batch = self._get_batch(batch_name, batch_source, create=True) self._sanity_check_batch(batch) # stash it away for processing later on self.current_batch = batch # parse the batch.xml and load up each issue mets file doc = etree.parse(batch.validated_batch_url) for e in doc.xpath('ndnp:reel', namespaces=ns): reel_number = e.attrib['reelNumber'].strip() try: reel = models.Reel.objects.get(number=reel_number, batch=batch) except models.Reel.DoesNotExist as e: reel = models.Reel(number=reel_number, batch=batch) reel.save() for e in doc.xpath('ndnp:issue', namespaces=ns): mets_url = urlparse.urljoin(batch.storage_url, e.text) try: issue, pages = self._load_issue(mets_url) except ValueError as e: LOGGER.exception(e) continue # commit new changes to the solr index, if we are indexing if self.PROCESS_OCR: LOGGER.info("Adding pages to solr index from issue %s", issue.title) for page in pages: LOGGER.debug("indexing ocr for: %s", page.url) self.solr.add(**page.solr_doc) page.indexed = True page.save() if self.PROCESS_OCR: 
LOGGER.info("Committing solr index") self.solr.commit() batch.save() msg = "processed %s pages" % batch.page_count LOGGER.info(msg) event = LoadBatchEvent(batch_name=batch_name, message=msg) event.save() except Exception as e: msg = "unable to load batch: %s" % e LOGGER.exception(msg) event = LoadBatchEvent(batch_name=batch_name, message=msg) event.save() try: self.purge_batch(batch_name) except Exception as pbe: LOGGER.error("purge batch failed for failed load batch: %s", pbe) LOGGER.exception(pbe) raise BatchLoaderException(msg) if settings.IS_PRODUCTION: batch.released = datetime.now() batch.save() return batch def _get_batch(self, batch_name, batch_source=None, create=False): if create: batch = self._create_batch(batch_name, batch_source) else: batch = Batch.objects.get(name=batch_name) return batch def _create_batch(self, batch_name, batch_source): if Batch.objects.filter(name=batch_name).count() != 0: raise BatchLoaderException("batch %s already loaded" % batch_name) batch = Batch() batch.name = batch_name batch.source = batch_source try: parts = batch_name.split("_", 3) if len(parts) is 4: parts = parts[1:] awardee_org_code, name_part, version = parts batch.awardee = Awardee.objects.get(org_code=awardee_org_code) except Awardee.DoesNotExist: msg = "no awardee for org code: %s" % awardee_org_code LOGGER.error(msg) raise BatchLoaderException(msg) batch.save() return batch def _load_issue(self, mets_file): LOGGER.debug("parsing issue mets file: %s", mets_file) doc = etree.parse(mets_file) # get the mods for the issue div = doc.xpath('.//mets:div[@TYPE="np:issue"]', namespaces=ns)[0] dmdid = div.attrib['DMDID'] mods = dmd_mods(doc, dmdid) # set up a new Issue issue = Issue() issue.volume = mods.xpath( 'string(.//mods:detail[@type="volume"]/mods:number[1])', namespaces=ns).strip() issue.number = mods.xpath( 'string(.//mods:detail[@type="issue"]/mods:number[1])', namespaces=ns).strip() issue.edition = int(mods.xpath( 'string(.//mods:detail[@type="edition"]/mods:number[1])', namespaces=ns)) issue.edition_label = mods.xpath( 'string(.//mods:detail[@type="edition"]/mods:caption[1])', namespaces=ns).strip() # parse issue date date_issued = mods.xpath('string(.//mods:dateIssued)', namespaces=ns) issue.date_issued = datetime.strptime(date_issued, '%Y-%m-%d') # attach the Issue to the appropriate Title lccn = mods.xpath('string(.//mods:identifier[@type="lccn"])', namespaces=ns).strip() try: title = Title.objects.get(lccn=lccn) except Exception as e: url = 'http://chroniclingamerica.loc.gov/lccn/%s/marc.xml' % lccn LOGGER.info("attempting to load marc record from %s", url) management.call_command('load_titles', url) title = Title.objects.get(lccn=lccn) issue.title = title issue.batch = self.current_batch issue.save() LOGGER.debug("saved issue: %s", issue.url) notes = [] for mods_note in mods.xpath('.//mods:note', namespaces=ns): type = mods_note.xpath('string(./@type)') label = mods_note.xpath('string(./@displayLabel)') text = mods_note.xpath('string(.)') note = models.IssueNote(type=type, label=label, text=text) notes.append(note) issue.notes = notes issue.save() # attach pages: lots of logging because it's expensive pages = [] for page_div in div.xpath('.//mets:div[@TYPE="np:page"]', namespaces=ns): try: pages.append(self._load_page(doc, page_div, issue)) except BatchLoaderException as e: LOGGER.error("Failed to load page. 
doc: %s, page div: %s, issue: %s", doc, page_div, issue) LOGGER.exception(e) return issue, pages def _load_page(self, doc, div, issue): dmdid = div.attrib['DMDID'] mods = dmd_mods(doc, dmdid) page = Page() seq_string = mods.xpath( 'string(.//mods:extent/mods:start)', namespaces=ns) try: page.sequence = int(seq_string) except ValueError: raise BatchLoaderException("could not determine sequence number for page from '%s'" % seq_string) page.number = mods.xpath( 'string(.//mods:detail[@type="page number"])', namespaces=ns ).strip() reel_number = mods.xpath( 'string(.//mods:identifier[@type="reel number"])', namespaces=ns ).strip() try: reel = models.Reel.objects.get(number=reel_number, batch=self.current_batch) page.reel = reel except models.Reel.DoesNotExist: if reel_number: reel = models.Reel(number=reel_number, batch=self.current_batch, implicit=True) reel.save() page.reel = reel else: LOGGER.warn("unable to find reel number in page metadata") LOGGER.info("Assigned page sequence: %s", page.sequence) _section_dmdid = div.xpath( 'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)', namespaces=ns) if _section_dmdid: section_mods = dmd_mods(doc, _section_dmdid) section_label = section_mods.xpath( 'string(.//mods:detail[@type="section label"]/mods:number[1])', namespaces=ns).strip() if section_label: page.section_label = section_label page.issue = issue LOGGER.info("Saving page. issue date: %s, page sequence: %s", issue.date_issued, page.sequence) # TODO - consider the possibility of executing the file name # assignments (below) before this page.save(). page.save() notes = [] for mods_note in mods.xpath('.//mods:note', namespaces=ns): type = mods_note.xpath('string(./@type)') label = mods_note.xpath('string(./@displayLabel)') text = mods_note.xpath('string(.)').strip() note = models.PageNote(type=type, label=label, text=text) notes.append(note) page.notes = notes # there's a level indirection between the METS structmap and the # details about specific files in this package ... # so we have to first get the FILEID from the issue div in the # structmap and then use it to look up the file details in the # larger document. for fptr in div.xpath('./mets:fptr', namespaces=ns): file_id = fptr.attrib['FILEID'] file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id, namespaces=ns)[0] file_type = file_el.attrib['USE'] # get the filename relative to the storage location file_name = file_el.xpath('string(./mets:FLocat/@xlink:href)', namespaces=ns) file_name = urlparse.urljoin(doc.docinfo.URL, file_name) file_name = self.storage_relative_path(file_name) if file_type == 'master': page.tiff_filename = file_name elif file_type == 'service': page.jp2_filename = file_name try: # extract image dimensions from technical metadata for jp2 for admid in file_el.attrib['ADMID'].split(' '): length, width = get_dimensions(doc, admid) if length and width: page.jp2_width = width page.jp2_length = length break except KeyError: LOGGER.info("Could not determine dimensions of jp2 for issue: %s page: %s... 
trying harder...", page.issue, page) if not page.jp2_width: raise BatchLoaderException("No jp2 width for issue: %s page: %s" % (page.issue, page)) if not page.jp2_length: raise BatchLoaderException("No jp2 length for issue: %s page: %s" % (page.issue, page)) elif file_type == 'derivative': page.pdf_filename = file_name elif file_type == 'ocr': page.ocr_filename = file_name if page.ocr_filename: # don't incurr overhead of extracting ocr text, word coordinates # and indexing unless the batch loader has been set up to do it if self.PROCESS_OCR: page = self.process_ocr(page) else: LOGGER.info("No ocr filename for issue: %s page: %s", page.issue, page) LOGGER.debug("saving page: %s", page.url) page.save() return page def process_ocr(self, page): LOGGER.debug("extracting ocr text and word coords for %s", page.url) url = urlparse.urljoin(self.current_batch.storage_url, page.ocr_filename) lang_text, coords = ocr_extractor(url) if self.PROCESS_COORDINATES: self._process_coordinates(page, coords) ocr = OCR() ocr.page = page ocr.save() lang_text_solr = {} for lang, text in lang_text.iteritems(): try: language = models.Language.objects.get(Q(code=lang) | Q(lingvoj__iendswith=lang)) except models.Language.DoesNotExist: LOGGER.warn("Language %s does not exist in the database. Defaulting to English.", lang) # default to english as per requirement language = models.Language.objects.get(code='eng') ocr.language_texts.create(language=language) lang_text_solr[language.code] = text page.ocr = ocr page.lang_text = lang_text_solr page.save() return page def _process_coordinates(self, page, coords): LOGGER.debug("writing out word coords for %s", page.url) fd, path = tempfile.mkstemp(text="w", suffix=".coordinates", dir=settings.TEMP_STORAGE) # get a temp file in case the coordinates dir is a NFS or S3 mount which have poor multiple write performance f = open(path, "w") f.write(gzip_compress(json.dumps(coords))) f.close() os.close(fd) final_path = models.coordinates_path(page._url_parts()) try: shutil.move(path, final_path) except Exception: LOGGER.warn("Could not move coordinates to [%s]. Waiting 5 seconds and trying again in case of network mount", final_path) time.sleep(5) shutil.move(path, final_path) def process_coordinates(self, batch_path): LOGGER.info("process word coordinates for batch at %s", batch_path) dirname, batch_name = os.path.split(batch_path.rstrip("/")) if dirname: batch_source = None else: batch_source = urlparse.urljoin(settings.BATCH_STORAGE, batch_name) if not batch_source.endswith("/"): batch_source += "/" batch_name = _normalize_batch_name(batch_name) try: batch = self._get_batch(batch_name, batch_source, create=False) self.current_batch = batch for issue in batch.issues.all(): for page in issue.pages.all(): if not page.ocr_filename: LOGGER.warn("Batch [%s] has page [%s] that has no OCR. Skipping processing coordinates for page." 
% (batch_name, page)) else: url = urlparse.urljoin(self.current_batch.storage_url, page.ocr_filename) LOGGER.debug("Extracting OCR from url %s", url) lang_text, coords = ocr_extractor(url) self._process_coordinates(page, coords) except Exception as e: msg = "unable to process coordinates for batch: %s" % e LOGGER.exception(msg) raise BatchLoaderException(msg) def storage_relative_path(self, path): """returns a relative path for a given file path within a batch, so that storage can be re-homed without having to rewrite paths in the db """ rel_path = path.replace(self.current_batch.storage_url, '') return rel_path @transaction.atomic def purge_batch(self, batch_name): event = LoadBatchEvent(batch_name=batch_name, message="starting purge") event.save() try: batch = self._get_batch(batch_name) self._purge_batch(batch) event = LoadBatchEvent(batch_name=batch_name, message="purged") event.save() # clean up symlinks if exists link_name = os.path.join(settings.BATCH_STORAGE, batch_name) if os.path.islink(link_name): LOGGER.info("Removing symlink %s", link_name) os.remove(link_name) except Exception as e: msg = "purge failed: %s" % e LOGGER.exception(msg) event = LoadBatchEvent(batch_name=batch_name, message=msg) event.save() raise BatchLoaderException(msg) def _purge_batch(self, batch): batch_name = batch.name # just delete batch causes memory to bloat out # so we do it piece-meal for issue in batch.issues.all(): for page in issue.pages.all(): page.delete() # remove coordinates if os.path.exists(models.coordinates_path(page._url_parts())): os.remove(models.coordinates_path(page._url_parts())) issue.delete() batch.delete() if self.PROCESS_OCR: self.solr.delete_query('batch:"%s"' % batch_name) self.solr.commit()
def index_pages(only_missing=False):
    """index all the pages that are modeled in the database
    """
    solr = SolrConnection(settings.SOLR)

    page_qs = models.Page.objects.order_by("pk")

    if only_missing:
        page_qs = page_qs.filter(indexed=False)
    else:
        # FIXME: we should not churn the index when documents have not been deleted:
        solr.delete_query("type:page")

    # To avoid MySQL limitations, we'll run two queries: the first will only
    # lookup the primary keys to allow MySQL to satisfy the ORDER BY / LIMIT
    # using only the index and then we'll use the primary keys to lookup the
    # full Page objects for each chunk which will actually be indexed.

    full_page_qs = page_qs.prefetch_related(
        Prefetch(
            "issue",
            queryset=models.Issue.objects.prefetch_related(
                "batch",
                "title",
                "title__languages",
                "title__alt_titles",
                "title__subjects",
                "title__notes",
                "title__places",
                "title__urls",
                "title__essays",
                "title__country",
                "title__holdings",
            ),
        ))

    count = 0
    for pk_chunk in sliced(page_qs.values_list("pk", flat=True), 100):
        # We have to force the PKs into a list to work around limitations in
        # MySQL preventing the use of a subquery which uses LIMIT:
        chunk = full_page_qs.filter(pk__in=list(pk_chunk))

        docs = []
        pks = []

        for page in chunk:
            try:
                docs.append(page.solr_doc)
                pks.append(page.pk)
            except Exception:
                LOGGER.warning("Unable to index page %s", page.url, exc_info=True)
                continue

        if docs:
            solr.add_many(docs)
            solr.commit()
            models.Page.objects.filter(pk__in=pks).update(indexed=True)

        count += len(pk_chunk)
        reset_queries()
        LOGGER.info("indexed %d pages", count)

    solr.commit()
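Illustrative calls for the two modes above (names taken from the function itself):

index_pages(only_missing=True)  # index only pages with indexed=False; keeps existing documents
index_pages()                   # deletes all type:page documents, then rebuilds the index in chunks of 100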
class BatchLoader(object): """This class allows you to load a batch into the database. A loader object serves as a context for a particular batch loading job. """ def __init__(self, process_ocr=True, process_coordinates=True): """Create a BatchLoader. The process_ocr parameter is used (mainly in testing) when we don't want to spend time actually extracting ocr text and indexing. """ self.PROCESS_OCR = process_ocr if self.PROCESS_OCR: self.solr = SolrConnection(settings.SOLR) self.PROCESS_COORDINATES = process_coordinates def _find_batch_file(self, batch): """ TODO: Who can we toss the requirement at to make this available in a canonical location? """ # look for batch_1.xml, BATCH_1.xml, etc for alias in [ "batch_1.xml", "BATCH_1.xml", "batchfile_1.xml", "batch_2.xml", "BATCH_2.xml", "batch.xml" ]: # TODO: might we want 'batch.xml' first? Leaving last for now to # minimize impact. url = urllib.parse.urljoin(batch.storage_url, alias) try: u = urllib.request.urlopen(url) validated_batch_file = alias break except urllib.error.HTTPError as e: continue except urllib.error.URLError as e: continue else: raise BatchLoaderException( "could not find batch_1.xml (or any of its aliases) in '%s' -- has the batch been validated?" % batch.path) return validated_batch_file def _sanity_check_batch(self, batch): #if not os.path.exists(batch.path): # raise BatchLoaderException("batch does not exist at %s" % batch.path) #b = urllib2.urlopen(batch.url) batch.validated_batch_file = self._find_batch_file(batch) def load_batch(self, batch_path): """Load a batch, and return a Batch instance for the batch that was loaded. loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01') """ self.pages_processed = 0 # Trailing slash breaks comparison to link_name below, so strip off batch_path = batch_path.rstrip("/") _logger.info("loading batch at %s", batch_path) dirname, batch_name = os.path.split(batch_path) if dirname: batch_source = None link_name = os.path.join(settings.BATCH_STORAGE, batch_name) # Create symlink if paths don't match, symlink not already there, # and batch_path wasn't input with a BATCH_STORAGE symlink path if (batch_path != link_name and not os.path.islink(link_name) and not (os.path.islink(settings.BATCH_STORAGE) and batch_path.startswith( os.path.realpath(settings.BATCH_STORAGE)))): _logger.info("creating symlink %s -> %s", batch_path, link_name) os.symlink(batch_path, link_name) else: batch_source = urllib.parse.urljoin(settings.BATCH_STORAGE, batch_name) if not batch_source.endswith("/"): batch_source += "/" batch_name = _normalize_batch_name(batch_name) try: batch = Batch.objects.get(name=batch_name) _logger.info("Batch already loaded: %s" % batch_name) return batch except Batch.DoesNotExist as e: pass _logger.info("loading batch: %s" % batch_name) t0 = time() times = [] event = LoadBatchEvent(batch_name=batch_name, message="starting load") event.save() batch = None try: # build a Batch object for the batch location batch = self._get_batch(batch_name, batch_source, create=True) self._sanity_check_batch(batch) # stash it away for processing later on self.current_batch = batch # parse the batch.xml and load up each issue mets file doc = etree.parse(batch.validated_batch_url) for e in doc.xpath('ndnp:reel', namespaces=ns): reel_number = e.attrib['reelNumber'].strip() try: reel = models.Reel.objects.get(number=reel_number, batch=batch) except models.Reel.DoesNotExist as e: reel = models.Reel(number=reel_number, batch=batch) reel.save() for e in doc.xpath('ndnp:issue', namespaces=ns): mets_url = 
urllib.parse.urljoin(batch.storage_url, e.text) try: issue = self._load_issue(mets_url) except ValueError as e: _logger.exception(e) continue reset_queries() times.append((time() - t0, self.pages_processed)) # commit new changes to the solr index, if we are indexing if self.PROCESS_OCR: self.solr.commit() batch.save() msg = "processed %s pages" % batch.page_count event = LoadBatchEvent(batch_name=batch_name, message=msg) _logger.info(msg) event.save() except Exception as e: msg = "unable to load batch: %s" % e _logger.error(msg) _logger.exception(e) event = LoadBatchEvent(batch_name=batch_name, message=msg) event.save() try: self.purge_batch(batch_name) except Exception as pbe: _logger.error("purge batch failed for failed load batch: %s" % pbe) _logger.exception(pbe) raise BatchLoaderException(msg) # updates the min and max years of all titles set_fulltext_range() return batch def _get_batch(self, batch_name, batch_source=None, create=False): if create: batch = self._create_batch(batch_name, batch_source) else: batch = Batch.objects.get(name=batch_name) return batch def _create_batch(self, batch_name, batch_source): if Batch.objects.filter(name=batch_name).count() != 0: raise BatchLoaderException("batch %s already loaded" % batch_name) batch = Batch() batch.name = batch_name batch.source = batch_source try: _, org_code, name_part, version = batch_name.split("_", 3) awardee_org_code = org_code batch.awardee = Awardee.objects.get(org_code=awardee_org_code) except Awardee.DoesNotExist as e: msg = "no awardee for org code: %s" % awardee_org_code _logger.error(msg) raise BatchLoaderException(msg) batch.save() return batch def _load_issue(self, mets_file): _logger.debug("parsing issue mets file: %s" % mets_file) doc = etree.parse(mets_file) # get the mods for the issue div = doc.xpath('.//mets:div[@TYPE="np:issue"]', namespaces=ns)[0] dmdid = div.attrib['DMDID'] mods = dmd_mods(doc, dmdid) # set up a new Issue issue = Issue() issue.volume = mods.xpath( 'string(.//mods:detail[@type="volume"]/mods:number[1])', namespaces=ns).strip() issue.number = mods.xpath( 'string(.//mods:detail[@type="issue"]/mods:number[1])', namespaces=ns).strip() issue.edition = int( mods.xpath( 'string(.//mods:detail[@type="edition"]/mods:number[1])', namespaces=ns)) issue.edition_label = mods.xpath( 'string(.//mods:detail[@type="edition"]/mods:caption[1])', namespaces=ns).strip() # parse issue date date_issued = mods.xpath('string(.//mods:dateIssued)', namespaces=ns) issue.date_issued = datetime.strptime(date_issued, '%Y-%m-%d') # attach the Issue to the appropriate Title lccn = mods.xpath('string(.//mods:identifier[@type="lccn"])', namespaces=ns).strip() try: title = Title.objects.get(lccn=lccn) except Exception as e: url = settings.MARC_RETRIEVAL_URLFORMAT % lccn _logger.info("attempting to load marc record from %s", url) management.call_command('load_titles', url) title = Title.objects.get(lccn=lccn) issue.title = title issue.batch = self.current_batch issue.save() _logger.debug("saved issue: %s" % issue.url) notes = [] for mods_note in mods.xpath('.//mods:note', namespaces=ns): type = mods_note.xpath('string(./@type)') label = mods_note.xpath('string(./@displayLabel)') text = mods_note.xpath('string(.)') note = models.IssueNote(type=type, label=label, text=text) notes.append(note) issue.notes.set(notes, bulk=False) issue.save() # attach pages: lots of logging because it's expensive for page_div in div.xpath('.//mets:div[@TYPE="np:page"]', namespaces=ns): try: page = self._load_page(doc, page_div, issue) 
self.pages_processed += 1 except BatchLoaderException as e: _logger.exception(e) return issue def _load_page(self, doc, div, issue): dmdid = div.attrib['DMDID'] mods = dmd_mods(doc, dmdid) page = Page() seq_string = mods.xpath('string(.//mods:extent/mods:start)', namespaces=ns) try: page.sequence = int(seq_string) except ValueError as e: raise BatchLoaderException( "could not determine sequence number for page from '%s'" % seq_string) page.number = mods.xpath('string(.//mods:detail[@type="page number"])', namespaces=ns).strip() reel_number = mods.xpath( 'string(.//mods:identifier[@type="reel number"])', namespaces=ns).strip() try: reel = models.Reel.objects.get(number=reel_number, batch=self.current_batch) page.reel = reel except models.Reel.DoesNotExist as e: if reel_number: reel = models.Reel(number=reel_number, batch=self.current_batch, implicit=True) reel.save() page.reel = reel else: _logger.warn("unable to find reel number in page metadata") _logger.info("Assigned page sequence: %s" % page.sequence) _section_dmdid = div.xpath( 'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)', namespaces=ns) if _section_dmdid: section_mods = dmd_mods(doc, _section_dmdid) section_label = section_mods.xpath( 'string(.//mods:detail[@type="section label"]/mods:number[1])', namespaces=ns).strip() if section_label: page.section_label = section_label page.issue = issue _logger.info("Saving page. issue date: %s, page sequence: %s" % (issue.date_issued, page.sequence)) # TODO - consider the possibility of executing the file name # assignments (below) before this page.save(). page.save() notes = [] for mods_note in mods.xpath('.//mods:note', namespaces=ns): type = mods_note.xpath('string(./@type)') label = mods_note.xpath('string(./@displayLabel)') text = mods_note.xpath('string(.)').strip() note = models.PageNote(type=type, label=label, text=text) notes.append(note) page.notes.set(notes, bulk=False) # there's a level indirection between the METS structmap and the # details about specific files in this package ... # so we have to first get the FILEID from the issue div in the # structmap and then use it to look up the file details in the # larger document. for fptr in div.xpath('./mets:fptr', namespaces=ns): file_id = fptr.attrib['FILEID'] file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id, namespaces=ns)[0] file_type = file_el.attrib['USE'] # get the filename relative to the storage location file_name = file_el.xpath('string(./mets:FLocat/@xlink:href)', namespaces=ns) file_name = urllib.parse.urljoin(doc.docinfo.URL, file_name) file_name = self.storage_relative_path(file_name) if file_type == 'master': page.tiff_filename = file_name elif file_type == 'service': page.jp2_filename = file_name try: # extract image dimensions from technical metadata for jp2 for admid in file_el.attrib['ADMID'].split(' '): length, width = get_dimensions(doc, admid) if length and width: page.jp2_width = width page.jp2_length = length break except KeyError as e: _logger.info( "Could not determine dimensions of jp2 for issue: %s page: %s... trying harder..." 
% (page.issue, page)) im = Image.open(page.jp2_abs_filename) page.jp2_width, page.jp2_length = im.size if not page.jp2_width: raise BatchLoaderException( "No jp2 width for issue: %s page: %s" % (page.issue, page)) if not page.jp2_length: raise BatchLoaderException( "No jp2 length for issue: %s page: %s" % (page.issue, page)) elif file_type == 'derivative': page.pdf_filename = file_name elif file_type == 'ocr': page.ocr_filename = file_name if page.ocr_filename: # don't incurr overhead of extracting ocr text, word coordinates # and indexing unless the batch loader has been set up to do it if self.PROCESS_OCR: self.process_ocr(page) else: _logger.info("No ocr filename for issue: %s page: %s" % (page.issue, page)) _logger.debug("saving page: %s" % page.url) page.save() return page def process_ocr(self, page, index=True): _logger.debug("extracting ocr text and word coords for %s" % page.url) url = urllib.parse.urljoin(self.current_batch.storage_url, page.ocr_filename) lang_text, coords = ocr_extractor(url) if self.PROCESS_COORDINATES: self._process_coordinates(page, coords) ocr = OCR() ocr.page = page ocr.save() for lang, text in lang_text.items(): try: language = models.Language.objects.get( Q(code=lang) | Q(lingvoj__iendswith=lang)) except models.Language.DoesNotExist: # default to english as per requirement language = models.Language.objects.get(code='eng') ocr.language_texts.create(language=language, text=text) page.ocr = ocr if index: _logger.debug("indexing ocr for: %s" % page.url) self.solr.add(**page.solr_doc) page.indexed = True page.save() def _process_coordinates(self, page, coords): _logger.debug("writing out word coords for %s" % page.url) f = open(models.coordinates_path(page._url_parts()), "wb") f.write(gzip_compress(json.dumps(coords).encode('utf-8'))) f.close() def process_coordinates(self, batch_path): _logger.info("process word coordinates for batch at %s", batch_path) dirname, batch_name = os.path.split(batch_path.rstrip("/")) if dirname: batch_source = None else: batch_source = urllib.parse.urljoin(settings.BATCH_STORAGE, batch_name) if not batch_source.endswith("/"): batch_source += "/" batch_name = _normalize_batch_name(batch_name) try: batch = self._get_batch(batch_name, batch_source, create=False) self.current_batch = batch for issue in batch.issues.all(): for page in issue.pages.all(): url = urllib.parse.urljoin(self.current_batch.storage_url, page.ocr_filename) lang_text, coords = ocr_extractor(url) self._process_coordinates(page, coords) except Exception as e: msg = "unable to process coordinates for batch: %s" % e _logger.error(msg) _logger.exception(e) raise BatchLoaderException(msg) def storage_relative_path(self, path): """returns a relative path for a given file path within a batch, so that storage can be re-homed without having to rewrite paths in the db """ rel_path = path.replace(self.current_batch.storage_url, '') return rel_path def purge_batch(self, batch_name): event = LoadBatchEvent(batch_name=batch_name, message="starting purge") event.save() try: batch = self._get_batch(batch_name) self._purge_batch(batch) event = LoadBatchEvent(batch_name=batch_name, message="purged") event.save() # clean up symlinks if exists link_name = os.path.join(settings.BATCH_STORAGE, batch_name) if os.path.islink(link_name): _logger.info("Removing symlink %s", link_name) os.remove(link_name) # updates the min and max years of all titles set_fulltext_range() except Exception as e: msg = "purge failed: %s" % e _logger.error(msg) _logger.exception(e) event = 
LoadBatchEvent(batch_name=batch_name, message=msg) event.save() raise BatchLoaderException(msg) def _purge_batch(self, batch): batch_name = batch.name # just delete batch causes memory to bloat out # so we do it piece-meal for issue in batch.issues.all(): for page in issue.pages.all(): page.delete() # remove coordinates if os.path.exists(models.coordinates_path(page._url_parts())): os.remove(models.coordinates_path(page._url_parts())) reset_queries() issue.delete() batch.delete() if self.PROCESS_OCR: self.solr.delete_query('batch:"%s"' % batch_name) self.solr.commit()
    silo_metadata = {}
    tries = 0
    while tries < 5:
        response = db.getSiloState(silo_name)
        if db.good(response):
            silo_metadata = response.results
            break
        else:
            tries += 1
    solr_doc["title"] = ""
    if "title" in silo_metadata:
        solr_doc["title"] = silo_metadata["title"]
    solr_doc["description"] = ""
    if "description" in silo_metadata:
        solr_doc["description"] = silo_metadata["description"]
    solr.add(_commit=False, **solr_doc)
    rq.task_complete()
elif msg["type"] == "d":
    # Deletion
    itemid = msg.get("id", None)
    if itemid:
        logger.info("Got deletion message on id:%s in silo:%s" % (itemid, silo_name))
        query = 'silo:"%s" AND id:"%s"' % (silo_name, itemid)
        solr.delete_query(query)
    elif silo_name:
        logger.info("Got deletion message on silo:%s" % silo_name)
        query = 'silo:"%s"' % silo_name
        solr.delete_query(query)
    # solr.commit()
    rq.task_complete()