def _load_page(self, doc, div, issue):
    """Create a Page and set its sequence number from the div's MODS record.

    NOTE(review): another ``_load_page`` with the same signature is defined
    right after this one; if both live in the same scope the later
    definition replaces this one. Confirm whether this variant is still
    needed.

    Args:
        doc: parsed document passed through to ``dmd_mods``.
        div: element whose ``DMDID`` attribute names the MODS section that
            describes the page.
        issue: accepted for interface compatibility; not used here.

    Raises:
        BatchLoaderException: if the MODS ``extent/start`` value cannot be
            converted to an integer.
    """
    dmdid = div.attrib['DMDID']
    mods = dmd_mods(doc, dmdid)
    page = Page()

    seq_string = mods.xpath(
        'string(.//mods:extent/mods:start)', namespaces=ns)
    try:
        page.sequence = int(seq_string)
    except ValueError as e:
        # Python 3 syntax ("as e"); keep the original ValueError as the
        # cause so the traceback shows what failed to parse.
        raise BatchLoaderException(
            "could not determine sequence number for page from '%s'"
            % seq_string) from e
def _load_page(self, doc, div, issue):
    """Build, populate and save a Page described by one structmap div.

    The page's descriptive metadata (sequence, page number, reel, section
    label, notes) is read from the MODS section referenced by the div's
    ``DMDID`` attribute; file names and jp2 dimensions are read from the
    ``mets:file`` elements referenced by the div's ``mets:fptr`` children.

    Args:
        doc: parsed METS document; must support ``xpath`` and expose
            ``docinfo.URL`` (used to resolve relative file hrefs).
        div: structmap element for the page; must carry a ``DMDID``
            attribute.
        issue: the issue the page is attached to (``page.issue``); its
            ``date_issued`` is only used for logging.

    Returns:
        The saved Page instance.

    Raises:
        BatchLoaderException: if the sequence number is not an integer, or
            if a 'service' (jp2) file ends up without a width or length.
    """
    dmdid = div.attrib['DMDID']
    mods = dmd_mods(doc, dmdid)
    page = Page()

    seq_string = mods.xpath('string(.//mods:extent/mods:start)',
                            namespaces=ns)
    try:
        page.sequence = int(seq_string)
    except ValueError as e:
        raise BatchLoaderException(
            "could not determine sequence number for page from '%s'"
            % seq_string) from e

    page.number = mods.xpath(
        'string(.//mods:detail[@type="page number"])',
        namespaces=ns).strip()

    reel_number = mods.xpath(
        'string(.//mods:identifier[@type="reel number"])',
        namespaces=ns).strip()
    try:
        page.reel = models.Reel.objects.get(number=reel_number,
                                            batch=self.current_batch)
    except models.Reel.DoesNotExist:
        if reel_number:
            # The reel is named in the page metadata but has no record in
            # this batch yet: create one flagged as implicit.
            reel = models.Reel(number=reel_number,
                               batch=self.current_batch,
                               implicit=True)
            reel.save()
            page.reel = reel
        else:
            _logger.warning("unable to find reel number in page metadata")

    _logger.info("Assigned page sequence: %s", page.sequence)

    # If the page sits inside a section div, copy that section's label.
    _section_dmdid = div.xpath(
        'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
        namespaces=ns)
    if _section_dmdid:
        section_mods = dmd_mods(doc, _section_dmdid)
        section_label = section_mods.xpath(
            'string(.//mods:detail[@type="section label"]/mods:number[1])',
            namespaces=ns).strip()
        if section_label:
            page.section_label = section_label

    page.issue = issue

    _logger.info("Saving page. issue date: %s, page sequence: %s",
                 issue.date_issued, page.sequence)

    # TODO - consider the possibility of executing the file name
    # assignments (below) before this page.save().
    # NOTE(review): presumably the page must be saved before notes can be
    # attached through the related manager below -- confirm before moving.
    page.save()

    notes = []
    for mods_note in mods.xpath('.//mods:note', namespaces=ns):
        # "note_type" rather than "type" to avoid shadowing the builtin.
        note_type = mods_note.xpath('string(./@type)')
        label = mods_note.xpath('string(./@displayLabel)')
        text = mods_note.xpath('string(.)').strip()
        notes.append(
            models.PageNote(type=note_type, label=label, text=text))
    page.notes.set(notes, bulk=False)

    # there's a level of indirection between the METS structmap and the
    # details about specific files in this package ...
    # so we have to first get the FILEID from this div in the
    # structmap and then use it to look up the file details in the
    # larger document.
    for fptr in div.xpath('./mets:fptr', namespaces=ns):
        file_id = fptr.attrib['FILEID']
        file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                            namespaces=ns)[0]
        file_type = file_el.attrib['USE']

        # get the filename relative to the storage location
        file_name = file_el.xpath('string(./mets:FLocat/@xlink:href)',
                                  namespaces=ns)
        file_name = urllib.parse.urljoin(doc.docinfo.URL, file_name)
        file_name = self.storage_relative_path(file_name)

        if file_type == 'master':
            page.tiff_filename = file_name
        elif file_type == 'service':
            page.jp2_filename = file_name
            try:
                # extract image dimensions from technical metadata for jp2;
                # the first ADMID yielding both values wins
                for admid in file_el.attrib['ADMID'].split(' '):
                    length, width = get_dimensions(doc, admid)
                    if length and width:
                        page.jp2_width = width
                        page.jp2_length = length
                        break
            except KeyError:
                # Reached when the file element has no ADMID attribute (or
                # a KeyError escapes get_dimensions): fall back to reading
                # the size from the image file itself.
                _logger.info(
                    "Could not determine dimensions of jp2 for issue: %s "
                    "page: %s... trying harder...", page.issue, page)
                im = Image.open(page.jp2_abs_filename)
                page.jp2_width, page.jp2_length = im.size

            if not page.jp2_width:
                raise BatchLoaderException(
                    "No jp2 width for issue: %s page: %s"
                    % (page.issue, page))
            if not page.jp2_length:
                raise BatchLoaderException(
                    "No jp2 length for issue: %s page: %s"
                    % (page.issue, page))
        elif file_type == 'derivative':
            page.pdf_filename = file_name
        elif file_type == 'ocr':
            page.ocr_filename = file_name

    if page.ocr_filename:
        # don't incur overhead of extracting ocr text, word coordinates
        # and indexing unless the batch loader has been set up to do it
        if self.PROCESS_OCR:
            self.process_ocr(page)
    else:
        _logger.info("No ocr filename for issue: %s page: %s",
                     page.issue, page)

    _logger.debug("saving page: %s", page.url)
    page.save()
    return page