def purge_batch(self, batch_name):
    """Purge a loaded batch and remove its symlink from batch storage.

    A ``LoadBatchEvent`` is saved when the purge starts, and another one
    when it either completes or fails.

    Args:
        batch_name: name of the batch to purge.

    Raises:
        BatchLoaderException: if any step of the purge fails. The message
            contains the text of the underlying error, which is also
            attached as the exception's cause.
    """
    event = LoadBatchEvent(batch_name=batch_name, message="starting purge")
    event.save()

    try:
        batch = self._get_batch(batch_name)
        self._purge_batch(batch)
        event = LoadBatchEvent(batch_name=batch_name, message="purged")
        event.save()

        # clean up the symlink in BATCH_STORAGE, if one exists
        link_name = os.path.join(settings.BATCH_STORAGE, batch_name)
        if os.path.islink(link_name):
            _logger.info("Removing symlink %s", link_name)
            os.remove(link_name)
    except Exception as e:
        # Record the failure as an event as well as in the log, then
        # surface it to the caller as a loader-specific error.
        msg = "purge failed: %s" % e
        _logger.error(msg)
        _logger.exception(e)
        event = LoadBatchEvent(batch_name=batch_name, message=msg)
        event.save()
        raise BatchLoaderException(msg) from e
def purge_batch(self, batch_name):
    """Purge the named batch, recording progress as LoadBatchEvents.

    Saves a "starting purge" event, purges the batch, saves a "purged"
    event, removes the batch's symlink under ``settings.BATCH_STORAGE``
    when one is present, and finally refreshes the fulltext range.

    Args:
        batch_name: name of the batch to purge.

    Raises:
        BatchLoaderException: when any of the steps above fails; a
            "purge failed" event is saved before raising.
    """
    LoadBatchEvent(batch_name=batch_name, message="starting purge").save()

    try:
        self._purge_batch(self._get_batch(batch_name))
        LoadBatchEvent(batch_name=batch_name, message="purged").save()

        # drop the storage symlink for this batch when there is one
        symlink_path = os.path.join(settings.BATCH_STORAGE, batch_name)
        if os.path.islink(symlink_path):
            _logger.info("Removing symlink %s", symlink_path)
            os.remove(symlink_path)

        # updates the min and max years of all titles
        set_fulltext_range()
    except Exception as err:
        failure = "purge failed: %s" % err
        _logger.error(failure)
        _logger.exception(err)
        LoadBatchEvent(batch_name=batch_name, message=failure).save()
        raise BatchLoaderException(failure)
def load_batch(self, batch_path):
    """Load a batch, and return a Batch instance for the batch
    that was loaded.

      loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')

    If a batch with the same normalized name already exists, it is
    returned unchanged and nothing is loaded.

    Args:
        batch_path: path to the batch directory. When it has a directory
            component, a symlink to it may be created under
            ``settings.BATCH_STORAGE``; when it is a bare name, the batch
            source is resolved relative to ``settings.BATCH_STORAGE``.

    Returns:
        The ``Batch`` that was loaded, or the already-loaded one.

    Raises:
        BatchLoaderException: if loading fails. A purge of the batch is
            attempted first; a failure of that purge is logged, not
            raised.
    """
    self.pages_processed = 0

    # Trailing slash breaks comparison to link_name below, so strip off
    batch_path = batch_path.rstrip("/")

    _logger.info("loading batch at %s", batch_path)
    dirname, batch_name = os.path.split(batch_path)
    if dirname:
        batch_source = None
        link_name = os.path.join(settings.BATCH_STORAGE, batch_name)

        # Create symlink if paths don't match, symlink not already there,
        # and batch_path wasn't input with a BATCH_STORAGE symlink path
        # NOTE(review): the startswith() test is a plain string-prefix
        # check, so a sibling directory sharing the prefix would also
        # match -- confirm whether that matters for deployments.
        if (batch_path != link_name
                and not os.path.islink(link_name)
                and not (os.path.islink(settings.BATCH_STORAGE)
                         and batch_path.startswith(
                             os.path.realpath(settings.BATCH_STORAGE)))):
            _logger.info("creating symlink %s -> %s", batch_path, link_name)
            os.symlink(batch_path, link_name)
    else:
        batch_source = urllib.parse.urljoin(settings.BATCH_STORAGE,
                                            batch_name)
        if not batch_source.endswith("/"):
            batch_source += "/"

    batch_name = _normalize_batch_name(batch_name)
    try:
        batch = Batch.objects.get(name=batch_name)
    except Batch.DoesNotExist:
        # Not loaded yet: fall through and load it.
        pass
    else:
        _logger.info("Batch already loaded: %s", batch_name)
        return batch

    _logger.info("loading batch: %s", batch_name)

    event = LoadBatchEvent(batch_name=batch_name, message="starting load")
    event.save()

    batch = None
    try:
        # build a Batch object for the batch location
        batch = self._get_batch(batch_name, batch_source, create=True)
        self._sanity_check_batch(batch)

        # stash it away for processing later on
        self.current_batch = batch

        # parse the batch.xml and load up each issue mets file
        doc = etree.parse(batch.validated_batch_url)

        # Make sure a Reel exists for every reel listed in the batch.
        # Loop variables are named distinctly from exception variables:
        # Python 3 unbinds an ``except ... as`` name when the handler exits.
        for reel_el in doc.xpath('ndnp:reel', namespaces=ns):
            reel_number = reel_el.attrib['reelNumber'].strip()
            try:
                models.Reel.objects.get(number=reel_number, batch=batch)
            except models.Reel.DoesNotExist:
                models.Reel(number=reel_number, batch=batch).save()

        for issue_el in doc.xpath('ndnp:issue', namespaces=ns):
            mets_url = urllib.parse.urljoin(batch.storage_url, issue_el.text)
            try:
                self._load_issue(mets_url)
            except ValueError as issue_error:
                # A bad issue is logged and skipped; the rest of the
                # batch still loads.
                _logger.exception(issue_error)
                continue
            # presumably clears Django's recorded query log between
            # issues -- confirm against the file's imports
            reset_queries()

        # commit new changes to the solr index, if we are indexing
        if self.PROCESS_OCR:
            self.solr.commit()

        batch.save()
        msg = "processed %s pages" % batch.page_count
        event = LoadBatchEvent(batch_name=batch_name, message=msg)
        _logger.info(msg)
        event.save()
    except Exception as e:
        msg = "unable to load batch: %s" % e
        _logger.error(msg)
        _logger.exception(e)
        event = LoadBatchEvent(batch_name=batch_name, message=msg)
        event.save()
        # Best-effort cleanup of whatever was loaded before the failure;
        # a purge error must not mask the original load error.
        try:
            self.purge_batch(batch_name)
        except Exception as pbe:
            _logger.error("purge batch failed for failed load batch: %s",
                          pbe)
            _logger.exception(pbe)
        raise BatchLoaderException(msg) from e

    # updates the min and max years of all titles
    set_fulltext_range()
    return batch
if not batch_source.endswith("/"): batch_source += "/" batch_name = _normalize_batch_name(batch_name) try: batch = Batch.objects.get(name=batch_name) _logger.info("Batch already loaded: %s" % batch_name) return batch except Batch.DoesNotExist, e: pass _logger.info("loading batch: %s" % batch_name) t0 = time() times = [] event = LoadBatchEvent(batch_name=batch_name, message="starting load") event.save() batch = None try: # build a Batch object for the batch location batch = self._get_batch(batch_name, batch_source, create=True) self._sanity_check_batch(batch) # stash it away for processing later on self.current_batch = batch # parse the batch.xml and load up each issue mets file doc = etree.parse(batch.validated_batch_url) for e in doc.xpath('ndnp:reel', namespaces=ns):