def _load_page(self, doc, div, issue):
    """Create a Page and read its sequence number from MODS metadata.

    The MODS record is located through the ``DMDID`` attribute of ``div``
    and the sequence is taken from ``mods:extent/mods:start``.

    As written here the method only parses and validates the sequence
    number; it neither saves nor returns the page, and ``issue`` is not
    used.

    Raises:
        BatchLoaderException: if the sequence value is not an integer.
    """
    dmdid = div.attrib['DMDID']
    mods = dmd_mods(doc, dmdid)
    page = Page()

    seq_string = mods.xpath(
        'string(.//mods:extent/mods:start)', namespaces=ns)
    try:
        page.sequence = int(seq_string)
    except ValueError:
        # "except ValueError, e" only parses on Python 2 and the bound
        # exception was never used; this form is valid on 2 and 3.
        raise BatchLoaderException(
            "could not determine sequence number for page from '%s'"
            % seq_string)
def _load_page(self, doc, div, issue):
    """Create a Page and read its sequence number from MODS metadata.

    ``div.attrib['DMDID']`` identifies the MODS record (fetched with
    ``dmd_mods``); the sequence comes from ``mods:extent/mods:start``.

    Only the sequence number is parsed and validated in this method body:
    the page is not saved or returned, and ``issue`` is unused.

    Raises:
        BatchLoaderException: if the sequence value cannot be converted
            to an integer.
    """
    dmdid = div.attrib['DMDID']
    mods = dmd_mods(doc, dmdid)
    page = Page()

    seq_string = mods.xpath(
        'string(.//mods:extent/mods:start)', namespaces=ns)
    try:
        page.sequence = int(seq_string)
    except ValueError:
        # The comma form of "except" is Python 2-only syntax, and the
        # captured exception object was unused, so it is dropped.
        raise BatchLoaderException(
            "could not determine sequence number for page from '%s'"
            % seq_string)
def handle(self, key, **options):
    """Attach flickr urls to the pages they link to.

    Iterates the (flickr url, chronam url) pairs yielded by
    ``flickr_chronam_links(key)``, resolves each chronam url to a Page
    through its path component, and makes sure a FlickrUrl exists for
    that page.  Progress is written to stdout, unresolvable pages are
    reported on stderr, and a total of newly created records is printed
    at the end.

    ``key`` is passed straight to ``flickr_chronam_links``; ``options``
    is accepted but not used.
    """
    LOGGER.debug("looking for chronam page content on flickr")

    n_created = 0
    for flickr_url, chronam_url in flickr_chronam_links(key):
        self.stdout.write("found flickr/chronam link: %s, %s" %
                          (flickr_url, chronam_url))

        # the path part of the chronam url is what identifies the Page
        page = Page.lookup(urlparse(chronam_url).path)
        if not page:
            self.stderr.write("page for %s not found" % chronam_url)
            continue

        # make sure there is a FlickrUrl tied to the appropriate page
        flickr_link, is_new = FlickrUrl.objects.get_or_create(
            value=flickr_url, page=page)
        if not is_new:
            self.stdout.write("already knew about %s" % flickr_url)
            continue

        n_created += 1
        flickr_link.save()
        self.stdout.write("updated page (%s) with flickr url (%s)" %
                          (page, flickr_url))

    self.stdout.write("created %s flickr urls" % n_created)
def handle(self, key, **options):
    """Record flickr urls against the chronam pages they reference.

    For every (flickr url, chronam url) pair produced by
    ``flickr_chronam_links(key)`` the chronam url's path is used to look
    up a Page; when one is found a FlickrUrl for it is fetched or
    created.  Each step is reported on stdout (missing pages on stderr)
    and the number of records created is written once the loop ends.

    ``options`` is accepted for the command interface and ignored.
    """
    LOGGER.debug("looking for chronam page content on flickr")

    new_links = 0
    for flickr_url, chronam_url in flickr_chronam_links(key):
        found_msg = "found flickr/chronam link: %s, %s" % (
            flickr_url, chronam_url)
        self.stdout.write(found_msg)

        # locate the Page model from the path of the page url
        page_path = urlparse(chronam_url).path
        page = Page.lookup(page_path)
        if not page:
            self.stderr.write("page for %s not found" % chronam_url)
            continue

        # fetch or create the FlickrUrl attached to that page
        link, was_created = FlickrUrl.objects.get_or_create(
            value=flickr_url, page=page)
        if was_created:
            new_links += 1
            link.save()
            outcome = "updated page (%s) with flickr url (%s)" % (
                page, flickr_url)
        else:
            outcome = "already knew about %s" % flickr_url
        self.stdout.write(outcome)

    self.stdout.write("created %s flickr urls" % new_links)
def _load_page(self, doc, div, issue):
    """Build, populate and save the Page described by a METS page div.

    Steps, in order:

    1. Read the page's MODS record (found via ``div.attrib['DMDID']``)
       for its sequence, page number and reel number.
    2. Attach the Reel matching that number in ``self.current_batch``.
       If none exists and a reel number was present, an implicit Reel is
       created and saved; if no reel number was present a warning is
       logged and the page gets no reel.
    3. Copy the section label from the enclosing ``np:section`` div, if
       there is one.
    4. Save the page, then attach a PageNote for every ``mods:note``.
    5. Resolve each ``mets:fptr`` under ``div`` to its ``mets:file`` and
       record the storage-relative file name by ``USE`` type (master,
       service, derivative, ocr).  For the service (jp2) file the image
       dimensions are also read from the technical metadata.
    6. Run OCR processing when an ocr file is present and
       ``self.PROCESS_OCR`` is set, then save the page again.

    Args:
        doc: the parsed METS document; queried with ``xpath`` and used
            for ``doc.docinfo.URL`` as the base of relative file names.
        div: the structmap div for this page.
        issue: the issue the page belongs to; assigned to ``page.issue``.

    Returns:
        The saved page (the object returned by ``self.process_ocr`` when
        OCR processing ran).

    Raises:
        BatchLoaderException: if the sequence number is not an integer,
            or if a service file is present but no jp2 width/length
            could be determined for it.
    """
    dmdid = div.attrib['DMDID']
    mods = dmd_mods(doc, dmdid)
    page = Page()

    seq_string = mods.xpath(
        'string(.//mods:extent/mods:start)', namespaces=ns)
    try:
        page.sequence = int(seq_string)
    except ValueError:
        raise BatchLoaderException("could not determine sequence number for page from '%s'" % seq_string)
    page.number = mods.xpath(
        'string(.//mods:detail[@type="page number"])',
        namespaces=ns
    ).strip()

    reel_number = mods.xpath(
        'string(.//mods:identifier[@type="reel number"])',
        namespaces=ns
    ).strip()
    try:
        reel = models.Reel.objects.get(number=reel_number,
                                       batch=self.current_batch)
        page.reel = reel
    except models.Reel.DoesNotExist:
        if reel_number:
            # the reel was not declared for this batch: create it on the
            # fly and flag it as implicit
            reel = models.Reel(number=reel_number,
                               batch=self.current_batch,
                               implicit=True)
            reel.save()
            page.reel = reel
        else:
            # Logger.warn is a deprecated alias; warning() is the
            # supported spelling on every Python version.
            LOGGER.warning("unable to find reel number in page metadata")

    LOGGER.info("Assigned page sequence: %s", page.sequence)

    _section_dmdid = div.xpath(
        'string(ancestor::mets:div[@TYPE="np:section"]/@DMDID)',
        namespaces=ns)
    if _section_dmdid:
        section_mods = dmd_mods(doc, _section_dmdid)
        section_label = section_mods.xpath(
            'string(.//mods:detail[@type="section label"]/mods:number[1])',
            namespaces=ns).strip()
        if section_label:
            page.section_label = section_label

    page.issue = issue

    LOGGER.info("Saving page. issue date: %s, page sequence: %s", issue.date_issued, page.sequence)

    # TODO - consider the possibility of executing the file name
    # assignments (below) before this page.save().
    page.save()

    notes = []
    for mods_note in mods.xpath('.//mods:note', namespaces=ns):
        # named note_type rather than "type" so the builtin is not shadowed
        note_type = mods_note.xpath('string(./@type)')
        label = mods_note.xpath('string(./@displayLabel)')
        text = mods_note.xpath('string(.)').strip()
        note = models.PageNote(type=note_type, label=label, text=text)
        notes.append(note)
    page.notes = notes

    # there's a level indirection between the METS structmap and the
    # details about specific files in this package ...
    # so we have to first get the FILEID from the issue div in the
    # structmap and then use it to look up the file details in the
    # larger document.
    for fptr in div.xpath('./mets:fptr', namespaces=ns):
        file_id = fptr.attrib['FILEID']
        file_el = doc.xpath('.//mets:file[@ID="%s"]' % file_id,
                            namespaces=ns)[0]
        file_type = file_el.attrib['USE']

        # get the filename relative to the storage location
        file_name = file_el.xpath('string(./mets:FLocat/@xlink:href)',
                                  namespaces=ns)
        file_name = urlparse.urljoin(doc.docinfo.URL, file_name)
        file_name = self.storage_relative_path(file_name)

        if file_type == 'master':
            page.tiff_filename = file_name
        elif file_type == 'service':
            page.jp2_filename = file_name
            try:
                # extract image dimensions from technical metadata for jp2
                for admid in file_el.attrib['ADMID'].split(' '):
                    length, width = get_dimensions(doc, admid)
                    if length and width:
                        page.jp2_width = width
                        page.jp2_length = length
                        break
            except KeyError:
                # NOTE(review): despite the message, no fallback is
                # attempted in this method; the checks below raise if the
                # dimensions are still missing.
                LOGGER.info("Could not determine dimensions of jp2 for issue: %s page: %s... trying harder...", page.issue, page)

            if not page.jp2_width:
                raise BatchLoaderException("No jp2 width for issue: %s page: %s" % (page.issue, page))
            if not page.jp2_length:
                raise BatchLoaderException("No jp2 length for issue: %s page: %s" % (page.issue, page))
        elif file_type == 'derivative':
            page.pdf_filename = file_name
        elif file_type == 'ocr':
            page.ocr_filename = file_name

    if page.ocr_filename:
        # don't incur overhead of extracting ocr text, word coordinates
        # and indexing unless the batch loader has been set up to do it
        if self.PROCESS_OCR:
            page = self.process_ocr(page)
    else:
        LOGGER.info("No ocr filename for issue: %s page: %s", page.issue, page)

    LOGGER.debug("saving page: %s", page.url)
    page.save()
    return page