page.issue = issue _logger.info("Saving page. issue date: %s, page sequence: %s" % (issue.date_issued, page.sequence)) # TODO - consider the possibility of executing the file name # assignments (below) before this page.save(). page.save() notes = [] for mods_note in mods.xpath('.//mods:note', namespaces=ns): type = mods_note.xpath('string(./@type)') label = mods_note.xpath('string(./@displayLabel)') text = mods_note.xpath('string(.)').strip() note = models.PageNote(type=type, label=label, text=text) notes.append(note) page.notes = notes # there's a level indirection between the METS structmap and the # details about specific files in this package ... # so we have to first get the FILEID from the issue div in the # structmap and then use it to look up the file details in the # larger document. for fptr in div.xpath('./mets:fptr', namespaces=ns): file_id = fptr.attrib['FILEID'] file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id, namespaces=ns)[0] file_type = file_el.attrib['USE']
def _load_page(self, doc, div, issue):
    """Build, populate and save a Page from one page div of a METS document.

    The page's descriptive metadata (sequence, page number, reel number,
    notes) is read from the MODS section referenced by the div's DMDID
    attribute; an optional section label is read from the MODS section of
    an enclosing ``np:section`` div. File names are resolved through the
    div's ``mets:fptr`` children.

    The page is saved twice: once before notes and file names are
    attached, and once at the end.

    :param doc: parsed METS document; must support ``xpath()`` and expose
        ``docinfo.URL`` (presumably an lxml element tree -- confirm
        against the caller).
    :param div: the structmap div element describing this page.
    :param issue: the issue object the page is attached to; its
        ``date_issued`` attribute is read for logging.
    :returns: the saved page (the object returned by ``self.process_ocr``
        when OCR processing is enabled and an OCR file is present).
    :raises BatchLoaderException: if the page sequence is not an integer,
        or if a ``service`` (jp2) file ends up without a width or length.
    :raises KeyError: if the div has no DMDID attribute, a fptr has no
        FILEID attribute, or a file element has no USE attribute.
    :raises IndexError: if a FILEID matches no ``mets:file`` element.
    """
    dmdid = div.attrib['DMDID']
    mods = dmd_mods(doc, dmdid)
    page = Page()

    seq_string = mods.xpath(
        'string(.//mods:extent/mods:start)', namespaces=ns)
    try:
        page.sequence = int(seq_string)
    except ValueError:
        raise BatchLoaderException("could not determine sequence number for page from '%s'" % seq_string)

    page.number = mods.xpath(
        'string(.//mods:detail[@type="page number"])',
        namespaces=ns
    ).strip()

    reel_number = mods.xpath(
        'string(.//mods:identifier[@type="reel number"])',
        namespaces=ns
    ).strip()
    try:
        reel = models.Reel.objects.get(number=reel_number,
                                       batch=self.current_batch)
        page.reel = reel
    except models.Reel.DoesNotExist:
        if reel_number:
            # The reel is named in the page metadata but has no record
            # for this batch yet: create one, flagged as implicit.
            reel = models.Reel(number=reel_number,
                               batch=self.current_batch,
                               implicit=True)
            reel.save()
            page.reel = reel
        else:
            # No reel number at all: the page is left without a reel.
            LOGGER.warning("unable to find reel number in page metadata")

    LOGGER.info("Assigned page sequence: %s", page.sequence)

    # An empty string here means the page div has no np:section ancestor
    # carrying a DMDID, in which case no section label is set.
    _section_dmdid = div.xpath(
        'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
        namespaces=ns)
    if _section_dmdid:
        section_mods = dmd_mods(doc, _section_dmdid)
        section_label = section_mods.xpath(
            'string(.//mods:detail[@type="section label"]/mods:number[1])',
            namespaces=ns).strip()
        if section_label:
            page.section_label = section_label

    page.issue = issue

    LOGGER.info("Saving page. issue date: %s, page sequence: %s", issue.date_issued, page.sequence)

    # TODO - consider the possibility of executing the file name
    # assignments (below) before this page.save().
    page.save()

    notes = []
    for mods_note in mods.xpath('.//mods:note', namespaces=ns):
        note_type = mods_note.xpath('string(./@type)')
        label = mods_note.xpath('string(./@displayLabel)')
        text = mods_note.xpath('string(.)').strip()
        note = models.PageNote(type=note_type, label=label, text=text)
        notes.append(note)
    page.notes = notes

    # there's a level indirection between the METS structmap and the
    # details about specific files in this package ...
    # so we have to first get the FILEID from the issue div in the
    # structmap and then use it to look up the file details in the
    # larger document.

    for fptr in div.xpath('./mets:fptr', namespaces=ns):
        file_id = fptr.attrib['FILEID']
        file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                            namespaces=ns)[0]
        file_type = file_el.attrib['USE']

        # get the filename relative to the storage location
        file_name = file_el.xpath('string(./mets:FLocat/@xlink:href)',
                                  namespaces=ns)
        file_name = urlparse.urljoin(doc.docinfo.URL, file_name)
        file_name = self.storage_relative_path(file_name)

        if file_type == 'master':
            page.tiff_filename = file_name
        elif file_type == 'service':
            page.jp2_filename = file_name
            try:
                # extract image dimensions from technical metadata for jp2;
                # the first ADMID that yields both values wins
                for admid in file_el.attrib['ADMID'].split(' '):
                    length, width = get_dimensions(doc, admid)
                    if length and width:
                        page.jp2_width = width
                        page.jp2_length = length
                        break
            except KeyError:
                # NOTE(review): despite the message, nothing further is
                # attempted in this method; the checks below raise if the
                # dimensions are still unset.
                LOGGER.info("Could not determine dimensions of jp2 for issue: %s page: %s... trying harder...", page.issue, page)

            if not page.jp2_width:
                raise BatchLoaderException("No jp2 width for issue: %s page: %s" % (page.issue, page))
            if not page.jp2_length:
                raise BatchLoaderException("No jp2 length for issue: %s page: %s" % (page.issue, page))
        elif file_type == 'derivative':
            page.pdf_filename = file_name
        elif file_type == 'ocr':
            page.ocr_filename = file_name

    if page.ocr_filename:
        # don't incur overhead of extracting ocr text, word coordinates
        # and indexing unless the batch loader has been set up to do it
        if self.PROCESS_OCR:
            page = self.process_ocr(page)
    else:
        LOGGER.info("No ocr filename for issue: %s page: %s", page.issue, page)

    LOGGER.debug("saving page: %s", page.url)
    page.save()
    return page