Beispiel #1
0
    def load_batch(self, batch_path):
        """Load a batch, and return a Batch instance for the batch
        that was loaded.

          loader.load_batch('/path/to/batch_curiv_ahwahnee_ver01')

        If a batch with the same normalized name already exists in the
        database it is returned unchanged and nothing is reloaded.

        :param batch_path: path to the batch directory, or a bare batch
            name, in which case the batch source is resolved against
            ``settings.BATCH_STORAGE``.
        :returns: the ``Batch`` that was loaded (or was already loaded).
        :raises BatchLoaderException: if any step of the load fails. A purge
            of the partially loaded batch is attempted before raising, and
            the original error is chained as ``__cause__``.
        """
        self.pages_processed = 0

        # Trailing slash breaks comparison to link_name below, so strip off
        batch_path = batch_path.rstrip("/")

        _logger.info("loading batch at %s", batch_path)
        dirname, batch_name = os.path.split(batch_path)
        if dirname:
            batch_source = None
            link_name = os.path.join(settings.BATCH_STORAGE, batch_name)

            # Create symlink if paths don't match, symlink not already there,
            # and batch_path wasn't input with a BATCH_STORAGE symlink path
            if (batch_path != link_name and not os.path.islink(link_name)
                    and not (os.path.islink(settings.BATCH_STORAGE)
                             and batch_path.startswith(
                                 os.path.realpath(settings.BATCH_STORAGE)))):
                _logger.info("creating symlink %s -> %s", batch_path,
                             link_name)
                os.symlink(batch_path, link_name)
        else:
            # Only a bare name was given: locate the batch relative to the
            # configured storage location.
            batch_source = urllib.parse.urljoin(settings.BATCH_STORAGE,
                                                batch_name)
            if not batch_source.endswith("/"):
                batch_source += "/"

        batch_name = _normalize_batch_name(batch_name)
        try:
            batch = Batch.objects.get(name=batch_name)
        except Batch.DoesNotExist:
            pass
        else:
            _logger.info("Batch already loaded: %s", batch_name)
            return batch

        _logger.info("loading batch: %s", batch_name)

        event = LoadBatchEvent(batch_name=batch_name, message="starting load")
        event.save()

        batch = None
        try:
            # build a Batch object for the batch location
            batch = self._get_batch(batch_name, batch_source, create=True)
            self._sanity_check_batch(batch)

            # stash it away for processing later on
            self.current_batch = batch

            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)

            # Make sure every reel declared in batch.xml has a Reel row.
            for reel_el in doc.xpath('ndnp:reel', namespaces=ns):
                reel_number = reel_el.attrib['reelNumber'].strip()
                try:
                    models.Reel.objects.get(number=reel_number, batch=batch)
                except models.Reel.DoesNotExist:
                    models.Reel(number=reel_number, batch=batch).save()

            for issue_el in doc.xpath('ndnp:issue', namespaces=ns):
                mets_url = urllib.parse.urljoin(batch.storage_url,
                                                issue_el.text)
                try:
                    self._load_issue(mets_url)
                except ValueError as err:
                    # A ValueError from one issue is logged and that issue
                    # is skipped; the rest of the batch still loads.
                    _logger.exception(err)
                    continue
                reset_queries()

            # commit new changes to the solr index, if we are indexing
            if self.PROCESS_OCR:
                self.solr.commit()

            batch.save()
            msg = "processed %s pages" % batch.page_count
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            _logger.info(msg)
            event.save()
        except Exception as e:
            msg = "unable to load batch: %s" % e
            _logger.error(msg)
            _logger.exception(e)
            event = LoadBatchEvent(batch_name=batch_name, message=msg)
            event.save()
            try:
                self.purge_batch(batch_name)
            except Exception as pbe:
                # Purging is best effort: report the failure but still raise
                # the load error below.
                _logger.error("purge batch failed for failed load batch: %s",
                              pbe)
                _logger.exception(pbe)
            # Chain the original error so its traceback is not lost.
            raise BatchLoaderException(msg) from e

        # updates the min and max years of all titles
        set_fulltext_range()
        return batch
Beispiel #2
0
    def _load_page(self, doc, div, issue):
        """Create, populate and save the Page described by a METS page div.

        The page's sequence, number, reel, section label and notes are read
        from the MODS record referenced by the div's ``DMDID``; file names
        (and jp2 dimensions) are resolved through the div's ``mets:fptr``
        children. The page is saved once before notes are attached and once
        more after the file names have been assigned.

        :param doc: the parsed METS document; must support ``xpath`` and
            expose ``docinfo.URL`` (presumably an lxml tree -- confirm).
        :param div: the page-level div element from the METS structmap.
        :param issue: the issue the page is attached to.
        :returns: the saved ``Page``.
        :raises BatchLoaderException: if the sequence number is not an
            integer, or if a service (jp2) file has no width or length.
        """
        dmdid = div.attrib['DMDID']
        mods = dmd_mods(doc, dmdid)
        page = Page()

        seq_string = mods.xpath('string(.//mods:extent/mods:start)',
                                namespaces=ns)
        try:
            page.sequence = int(seq_string)
        except ValueError as e:
            raise BatchLoaderException(
                "could not determine sequence number for page from '%s'" %
                seq_string) from e
        page.number = mods.xpath('string(.//mods:detail[@type="page number"])',
                                 namespaces=ns).strip()

        reel_number = mods.xpath(
            'string(.//mods:identifier[@type="reel number"])',
            namespaces=ns).strip()
        try:
            page.reel = models.Reel.objects.get(number=reel_number,
                                                batch=self.current_batch)
        except models.Reel.DoesNotExist:
            if reel_number:
                # No Reel row exists for this number in the current batch,
                # so create one flagged as implicit.
                reel = models.Reel(number=reel_number,
                                   batch=self.current_batch,
                                   implicit=True)
                reel.save()
                page.reel = reel
            else:
                _logger.warning("unable to find reel number in page metadata")

        _logger.info("Assigned page sequence: %s", page.sequence)

        _section_dmdid = div.xpath(
            'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
            namespaces=ns)
        if _section_dmdid:
            section_mods = dmd_mods(doc, _section_dmdid)
            section_label = section_mods.xpath(
                'string(.//mods:detail[@type="section label"]/mods:number[1])',
                namespaces=ns).strip()
            if section_label:
                page.section_label = section_label

        page.issue = issue

        _logger.info("Saving page. issue date: %s, page sequence: %s",
                     issue.date_issued, page.sequence)

        # TODO - consider the possibility of executing the file name
        #        assignments (below) before this page.save().
        page.save()

        notes = [
            models.PageNote(type=mods_note.xpath('string(./@type)'),
                            label=mods_note.xpath('string(./@displayLabel)'),
                            text=mods_note.xpath('string(.)').strip())
            for mods_note in mods.xpath('.//mods:note', namespaces=ns)
        ]
        page.notes.set(notes, bulk=False)

        # there's a level indirection between the METS structmap and the
        # details about specific files in this package ...
        # so we have to first get the FILEID from the issue div in the
        # structmap and then use it to look up the file details in the
        # larger document.

        for fptr in div.xpath('./mets:fptr', namespaces=ns):
            file_id = fptr.attrib['FILEID']
            file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                                namespaces=ns)[0]
            file_type = file_el.attrib['USE']

            # get the filename relative to the storage location
            file_name = file_el.xpath('string(./mets:FLocat/@xlink:href)',
                                      namespaces=ns)
            file_name = urllib.parse.urljoin(doc.docinfo.URL, file_name)
            file_name = self.storage_relative_path(file_name)

            if file_type == 'master':
                page.tiff_filename = file_name
            elif file_type == 'service':
                page.jp2_filename = file_name
                try:
                    # extract image dimensions from technical metadata for jp2
                    for admid in file_el.attrib['ADMID'].split(' '):
                        length, width = get_dimensions(doc, admid)
                        if length and width:
                            page.jp2_width = width
                            page.jp2_length = length
                            break
                except KeyError:
                    _logger.info(
                        "Could not determine dimensions of jp2 for issue: %s page: %s... trying harder...",
                        page.issue, page)
                    # Fall back to reading the size from the image file
                    # itself; the context manager closes the file handle.
                    with Image.open(page.jp2_abs_filename) as im:
                        page.jp2_width, page.jp2_length = im.size

                if not page.jp2_width:
                    raise BatchLoaderException(
                        "No jp2 width for issue: %s page: %s" %
                        (page.issue, page))
                if not page.jp2_length:
                    raise BatchLoaderException(
                        "No jp2 length for issue: %s page: %s" %
                        (page.issue, page))
            elif file_type == 'derivative':
                page.pdf_filename = file_name
            elif file_type == 'ocr':
                page.ocr_filename = file_name

        if page.ocr_filename:
            # don't incur the overhead of extracting ocr text, word
            # coordinates and indexing unless the batch loader has been set
            # up to do it
            if self.PROCESS_OCR:
                self.process_ocr(page)
        else:
            _logger.info("No ocr filename for issue: %s page: %s",
                         page.issue, page)

        _logger.debug("saving page: %s", page.url)
        page.save()
        return page
Beispiel #3
0
            # stash it away for processing later on
            self.current_batch = batch

            # parse the batch.xml and load up each issue mets file
            doc = etree.parse(batch.validated_batch_url)

            for e in doc.xpath('ndnp:reel', namespaces=ns):

                reel_number = e.attrib['reelNumber'].strip()

                try:
                    reel = models.Reel.objects.get(number=reel_number,
                                                   batch=batch)
                except models.Reel.DoesNotExist, e:
                    reel = models.Reel(number=reel_number, batch=batch)
                    reel.save()

            for e in doc.xpath('ndnp:issue', namespaces=ns):
                mets_url = urlparse.urljoin(batch.storage_url, e.text)
                try:
                    issue = self._load_issue(mets_url)
                except ValueError, e:
                    _logger.exception(e)
                    continue
                reset_queries()
                times.append((time() - t0, self.pages_processed))

            # commit new changes to the solr index, if we are indexing
            if self.PROCESS_OCR:
                self.solr.commit()