def load_alma_bib_record(kdip):
    """ Bib record from Alma.

    Accepts either a :class:`models.KDip` instance or a kdip_id string;
    looks up the item by barcode to obtain its Alma mms id, saves that
    id back onto the KDip, then fetches and returns the parsed bib
    record as a :class:`models.AlmaBibRecord`.
    """
    # accept a bare kdip_id string and resolve it to a KDip object
    if isinstance(kdip, basestring):
        kdip = models.KDip.objects.get(kdip_id=kdip)
    # first request: look up the item by barcode to discover the mms id
    item = requests.get('%sitems' % settings.ALMA_API_ROOT,
        params={
            'item_barcode': kdip.kdip_id,
            'apikey': settings.ALMA_APIKEY
        }
    )
    bib_rec = item.text.encode('utf-8').strip()
    item_obj = load_xmlobject_from_string(bib_rec, models.AlmaBibItem)
    # persist the discovered mms id on the KDip record
    kdip.mms_id = item_obj.mms_id
    kdip.save()
    # second request: fetch the full bib record by mms id
    bib = requests.get('%sbibs/%s' % (settings.ALMA_API_ROOT, kdip.mms_id),
        params={'apikey': settings.ALMA_APIKEY}
    )
    bib_xml = bib.text.encode('utf-8').strip()
    return load_xmlobject_from_string(bib_xml, models.AlmaBibRecord)
def query(self, xquery=None, start=1, how_many=10, cache=False, session=None,
          release=None, result_type=None):
    """Execute an XQuery query, returning the results directly.

    :param xquery: a string XQuery query
    :param start: first index to return (1-based)
    :param how_many: maximum number of items to return
    :param cache: boolean, to cache a query and return a session id (optional)
    :param session: session id, to retrieve a cached session (optional)
    :param release: session id to be released (optional)
    :rtype: the resultType specified at the creation of this ExistDB;
        defaults to :class:`QueryResult`.
    """
    params = {
        '_howmany': how_many,
        '_start': start,
    }
    if xquery is not None:
        params['_query'] = xquery
    if cache:
        params['_cache'] = 'yes'
    if release is not None:
        params['_release'] = release
    if session is not None:
        params['_session'] = session
    if result_type is None:
        result_type = self.resultType

    # summarize request options for debug logging; the (possibly long)
    # query text is appended separately below
    opts = ' '.join('%s=%s' % (key.lstrip('_'), val)
                    for key, val in params.iteritems() if key != '_query')
    if xquery:
        debug_query = '\n%s' % xquery
    else:
        debug_query = ''
    # FIX: use lazy %-style logging args so the message is only built
    # when debug logging is actually enabled
    logger.debug('query %s%s', opts, debug_query)

    response = self.session.get(self.restapi_path(''), params=params,
                                stream=False)
    if response.status_code == requests.codes.ok:
        # successful release doesn't return any content
        if release is not None:
            return True  # successfully released
        # TODO: test unicode handling
        return xmlmap.load_xmlobject_from_string(response.content, result_type)
    # 400 bad request returns an xml error we can parse
    elif response.status_code == requests.codes.bad_request:
        err = xmlmap.load_xmlobject_from_string(response.content,
                                                ExistExceptionResponse)
        raise ExistDBException(err.message)
    # not sure if any information is available on other error codes
    else:
        raise ExistDBException(response.content)
def test_index_data(self):
    """Verify index_data() output for the SAMPLE_MODS fixture field by field."""
    loaded = load_xmlobject_from_string(SAMPLE_MODS, mods.Mods)
    index_data = loaded.index_data()
    self.assertEqual(index_data['abstract'], [u'Poétry description...'])
    self.assertEqual(index_data['contributor_display'],
                     ['Smith, Tom, 1803 or 4-1860 (creator)',
                      'Baker, Jim, 1718-1762 (director)',
                      'Wilson, Jane',
                      'Brown University. English (sponsor)'])
    self.assertEqual(index_data['copyrightDate'], '2008-01-01T00:00:00Z')
    self.assertEqual(index_data['dateCreated'], '2008-02-03T00:00:00Z')
    self.assertEqual(index_data['dateModified'], '2008-05-06T00:00:00Z')
    self.assertEqual(index_data['dateModified_ssim'],
                     ['2008-06-07-2009-01-02', 'invalid date', '2008-06-07'])
    self.assertEqual(index_data['genre'],
                     [u'aat theses', u'bdr theses', u'local theses'])
    self.assertEqual(index_data['mods_genre_aat_ssim'], [u'aat theses'])
    self.assertEqual(index_data['mods_genre_bdr_ssim'], [u'bdr theses'])
    self.assertEqual(index_data['mods_genre_local_ssim'], [u'local theses'])
    self.assertEqual(index_data['mods_access_condition_logo_ssim'],
                     [u'http://i.creativecommons.org/p/zero/1.0/88x31.png'])
    self.assertEqual(index_data['mods_access_condition_use_text_tsim'],
                     [u'To the extent possible under law, the person who associated CC0 with this work has waived all copyright and related or neighboring rights to this work.'])
    self.assertEqual(index_data['mods_access_condition_use_link_ssim'],
                     [u'http://creativecommons.org/publicdomain/zero/1.0/'])
    self.assertEqual(index_data['mods_id'], 'id101')
    self.assertEqual(index_data['mods_id_test_type_ssim'], ['Test type id'])
    self.assertEqual(index_data['mods_note_random_type_ssim'], [u'random type note'])
    self.assertEqual(index_data['mods_note_display_label_ssim'], [u'display label note'])
    self.assertEqual(index_data['mods_title_alt'], [u'alternative title'])
    # NOTE(review): the fixture corporate name appears to contain an embedded
    # newline -- confirm exact whitespace against SAMPLE_MODS
    self.assertEqual(index_data['name'],
                     ['Smith, Tom', 'Baker, Jim', 'Wilson, Jane',
                      'Brown University. \nEnglish'])
    self.assertEqual(index_data['note'],
                     [u'Thésis (Ph.D.)', u'discarded: random type note',
                      u'Short: Without ending',
                      u'Display @#$label? display label note'])
    self.assertEqual(index_data['other_title'], [u'Other title'])
    self.assertEqual(index_data['primary_title'], u'Poétry')
    self.assertEqual(index_data['keyword'],
                     [u'Display Labél! modernism', u'metalepsis',
                      u'Display Label: Yeats', u'Stevens', u'Merrill', u'Eliot',
                      u"label missing colon: post modernism"])
    self.assertEqual(index_data['mods_subject_ssim'],
                     [u'Display Labél! modernism', u'metalepsis',
                      u'Display Label: Yeats', u'Stevens', u'Merrill', u'Eliot',
                      u"label missing colon: post modernism"])
    self.assertEqual(index_data['mods_subject_display_label_ssim'],
                     [u'modernism', u'Yeats'])
    self.assertEqual(index_data['mods_subject_local_ssim'],
                     [u'Stevens', u'Eliot'])
def test_geographic_subjects(self):
    """Hierarchical geographic subject parses all four levels from the fixture."""
    record = load_xmlobject_from_string(SAMPLE_MODS, mods.Mods)
    geo_subject = [s for s in record.subjects if s.hierarchical_geographic][0]
    geo = geo_subject.hierarchical_geographic
    expected = [
        ('country', 'United States'),
        ('state', 'Louisiana'),
        ('city', 'New Orleans'),
        ('city_section', 'Lower Ninth Ward'),
    ]
    for attr, value in expected:
        self.assertEqual(getattr(geo, attr), value)
def test_main(self):
    """Round-trip a Fox object through serialization and confirm the pid."""
    self.fox.pid = 'sample:123'
    serialized = self.fox.serialize()
    reparsed = load_xmlobject_from_string(serialized, Fox)
    self.assertEqual('sample:123', reparsed.pid)
def test_isvalid(self):
    """The fixture MODS validates; the invalid fixture does not."""
    # if additions to the MODS test fixture cause validation errors,
    # call self.mods.validation_errors() here to debug
    self.assertTrue(self.mods.is_valid())
    bad_record = load_xmlobject_from_string(self.invalid_xml, mods.MODS)
    self.assertFalse(bad_record.is_valid())
def process_article(self, pid, symp_pub, options):
    """PUT a publication record to the Symplectic API and report warnings.

    :param pid: pid of the article (used for the url and log messages)
    :param symp_pub: publication xmlobject to serialize and upload
    :param options: command options dict; options['noact'] suppresses the PUT
    """
    self.output(1, "Processing Article %s" % pid)
    # put article xml
    url = '%s/%s' % (self.pub_create_url, pid)
    status = None
    if symp_pub.is_empty():
        # FIX: corrected typo in log message ("becase" -> "because")
        self.output(1, "Skipping because XML is empty")
        self.counts['skipped'] += 1
        return
    valid = symp_pub.is_valid()
    self.output(2, "XML valid: %s" % valid)
    if not valid:
        self.output(0, "Error publication xml is not valid for pid %s %s"
                    % (pid, symp_pub.validation_errors()))
        self.counts['errors'] += 1
        return
    if not options['noact']:
        response = self.session.put(url, data=symp_pub.serialize())
        status = response.status_code
    self.output(2, "PUT %s %s" % (url, status if status else "<NO ACT>"))
    self.output(2, "=====================================================================")
    self.output(2, symp_pub.serialize(pretty=True).decode('utf-8', 'replace'))
    self.output(2, "---------------------------------------------------------------------")
    if status and status not in [200, 201]:
        self.output(0, "Error publication PUT returned code %s for %s" % (status, pid))
        self.counts['errors'] += 1
        return
    elif not options['noact']:
        # check for warnings returned in the import response
        for w in load_xmlobject_from_string(response.raw.read(),
                                            OESympImportPublication).warnings:
            self.output(0, 'Warning: %s %s' % (pid, w.message))
            self.counts['warnings'] += 1
    self.counts['articles_processed'] += 1
def test_round_trip(self):
    """Serialized MODS parses back with the same field values."""
    self.mods.title = "Sample title"
    self.mods.publisher = "BUL"
    serialized = self.mods.serialize(pretty=False)
    reloaded = load_xmlobject_from_string(serialized, Mods)
    self.assertEqual(reloaded.title, 'Sample title')
    self.assertEqual(reloaded.publisher, 'BUL')
def test_subjects(self):
    """Topics appended as Subject elements survive a serialize/parse cycle."""
    self.mods.title = "Sample"
    topics = ['sample', 'test']
    for topic in topics:
        self.mods.subjects.append(mods.Subject(topic=topic))
    round_tripped = load_xmlobject_from_string(self.mods.serialize(), mods.Mods)
    self.assertEqual(topics, [subj.topic for subj in round_tripped.subjects])
def test_load_sample_mods(self):
    """Spot-check fields parsed from the SAMPLE_MODS fixture: titles, origin
    info, names, genres, subjects, notes, physical description,
    classifications, locations and related items."""
    loaded = load_xmlobject_from_string(SAMPLE_MODS, mods.Mods)
    self.assertEqual(loaded.id, 'id101')
    self.assertEqual(loaded.title, 'Poétry\n Title')
    self.assertEqual(loaded.title_info[1].title, 'Other title')
    self.assertEqual(loaded.title_info[2].title, 'alternative title')
    self.assertEqual(loaded.title_info[2].type, 'alternative')
    self.assertEqual(loaded.title_info[2].label, 'First line')
    self.assertEqual(loaded.origin_info.label, 'date added')
    self.assertEqual(loaded.origin_info.places[0].place_terms[0].text, 'USA')
    self.assertEqual(loaded.origin_info.places[0].place_terms[0].authority, 'auth')
    self.assertEqual(loaded.origin_info.places[0].place_terms[0].authority_uri, 'http://auth.com')
    self.assertEqual(loaded.origin_info.places[0].place_terms[0].value_uri, 'http://auth.com/usa')
    # test names
    personal_names = [name.name_parts[0].text for name in loaded.names
                      if name.type == 'personal' and name.name_parts[0].text]
    self.assertEqual(len(personal_names), 3)
    personal_name_list = ['Smith, Tom', 'Baker, Jim', 'Wilson, Jane']
    for i in range(3):
        self.assertTrue(personal_names[i] in personal_name_list)
    corporate_names = [name.name_parts[0].text for name in loaded.names
                       if name.type == 'corporate']
    # NOTE(review): fixture corporate name appears to contain an embedded
    # newline -- confirm exact whitespace against SAMPLE_MODS
    corporate_name_list = ['Brown University. \nEnglish', 'Providence, RI']
    self.assertEqual(corporate_names, corporate_name_list)
    tom_smith = [name for name in loaded.names
                 if name.name_parts[0].text == 'Smith, Tom'][0]
    self.assertEqual(tom_smith.authority, 'fast')
    self.assertEqual(tom_smith.authority_uri, 'http://fast.com')
    self.assertEqual(tom_smith.value_uri, 'http://fast.com/1')
    self.assertEqual(tom_smith.roles[0].authority, 'marcrelator')
    self.assertEqual(tom_smith.roles[0].authority_uri, 'http://id.loc.gov/vocabulary/relators')
    self.assertEqual(tom_smith.roles[0].value_uri, 'http://id.loc.gov/vocabulary/relators/cre')
    self.assertEqual(loaded.resource_type, 'text')
    self.assertEqual(loaded.genres[1].text, 'aat theses')
    self.assertEqual(loaded.genres[4].text, '123')
    self.assertEqual(loaded.genres[4].authority, 'fast')
    self.assertEqual(loaded.genres[4].authority_uri, 'http://fast.com')
    self.assertEqual(loaded.genres[4].value_uri, 'http://fast.com/123')
    s = [s for s in loaded.subjects if s.topic == '456'][0]
    self.assertEqual(s.authority, 'fast')
    self.assertEqual(s.authority_uri, 'http://fast.com')
    self.assertEqual(s.value_uri, 'http://fast.com/456')
    self.assertEqual(loaded.notes[0].text, 'Thésis (Ph.D.)')
    self.assertEqual(loaded.target_audiences[0].text, 'Target Audience')
    self.assertEqual(loaded.target_audiences[0].authority, 'local')
    self.assertEqual(loaded.physical_description.extent, 'viii, 208 p.')
    self.assertEqual(loaded.physical_description.digital_origin, 'born digital')
    self.assertEqual(loaded.physical_description.note, 'note 1')
    self.assertEqual(loaded.classifications[0].text, 'Some classification')
    self.assertEqual(loaded.classifications[0].label, 'Test classification')
    self.assertEqual(loaded.classifications[0].authority, 'classauth')
    self.assertEqual(loaded.classifications[0].authority_uri, 'http://classauth.com')
    self.assertEqual(loaded.classifications[0].value_uri, 'http://classauth.com/some')
    self.assertEqual(loaded.locations[0].physical.text, 'Random location')
    self.assertEqual(loaded.locations[0].physical.authority, 'locauth')
    self.assertEqual(loaded.locations[0].physical.authority_uri, 'http://locauth.com')
    self.assertEqual(loaded.locations[0].physical.value_uri, 'http://locauth.com/random')
    self.assertEqual(loaded.locations[0].holding_simple.copy_information[0].notes[0].text, 'location note')
    self.assertEqual(loaded.related_items[1].label, 'location of original')
    self.assertEqual(loaded.related_items[1].classifications[0].text, 'Classification')
def test_setting_xlink_href(self):
    """An xlink:href attribute set directly on the node survives round-tripping."""
    href_attr = '{%s}href' % mods.XLINK_NAMESPACE
    condition = mods.AccessCondition(text='access condition')
    condition.node.set(href_attr, 'http://example.com')
    self.mods.access_conditions.append(condition)
    reloaded = load_xmlobject_from_string(self.mods.serialize(pretty=False), mods.Mods)
    self.assertEqual(reloaded.access_conditions[0].node.get(href_attr),
                     'http://example.com')
def loadFixtureData(self, fname):
    """Load fixture data, rewriting its pid when this test defines a pidspace."""
    data = load_fixture_data(fname)
    if not hasattr(self, "pidspace"):
        return data
    # pidspace is specified: get a new pid from fedora and set it in the xml
    parsed = xmlmap.load_xmlobject_from_string(data, _MinimalFoxml)
    parsed.pid = self.getNextPid()
    return parsed.serialize()
def test_multiContextAndHolder(self):
    """Three contexts plus a holder serialize and reload intact."""
    for rights_id in ("rights1", "rights2", "rights3"):
        self.init_context(rights_id)
    self.init_holder()
    serialized = self.rights.serialize(pretty=True)
    reloaded = load_xmlobject_from_string(serialized, Rights)
    self.assertEqual(len(reloaded.ctext), 3)
    self.assertEqual(reloaded.holder.context_ids, "rights1 rights2 rights3")
def from_string(cls, xml_string, validate=True):
    """
    Build an instance of this class from an XML string.

    :param xml_string: XML string
    :param validate: when True, validate against the embedded XSD definition
    :type validate: Boolean
    :returns: the parsed Python object
    """
    parsed = xmlmap.load_xmlobject_from_string(xml_string, xmlclass=cls,
                                               validate=validate)
    return parsed
def test_subjects(self):
    """LocalTopic entries round-trip through serialization."""
    self.mods.title = "Sample"
    keywords = ['sample', 'test']
    for word in keywords:
        entry = LocalTopic()
        entry.topic = word
        self.mods.local_topic.append(entry)
    reparsed = load_xmlobject_from_string(self.mods.serialize(), Mods)
    self.assertEqual(keywords, [item.topic for item in reparsed.local_topic])
def test_update_instance(self):
    """Exercise XmlObjectForm.update_instance in three scenarios: updating an
    existing instance, creating one from scratch with an unbound form, and
    deleting items via formset DELETE flags."""
    # initialize data the same way a view processing a POST would
    update_form = TestForm(self.post_data, instance=self.testobj)
    # check that form is valid - if no errors, this populates cleaned_data
    self.assertTrue(update_form.is_valid())
    instance = update_form.update_instance()
    self.assert_(isinstance(instance, TestObject))
    self.assertEqual(21, instance.int)
    self.assertEqual(False, instance.bool)
    self.assertEqual('b', instance.id)
    self.assertEqual('completely new text content', instance.longtext)
    self.assertEqual(0, instance.other_child.val)
    # spot check that values were set properly in the xml
    xml = instance.serialize()
    self.assert_('id="b"' in xml)
    self.assert_('<boolean>no</boolean>' in xml)

    # test save on form with no pre-existing xmlobject instance
    class SimpleForm(XmlObjectForm):
        class Meta:
            model = TestObject
            # fields with simple, top-level xpaths
            fields = ['id', 'bool', 'longtext']
            # creation for nested node not yet supported in xmlmap - excluding int
            exclude = ['child']  # exclude subform to simplify testing

    new_form = SimpleForm({'id': 'A1', 'bool': True, 'longtext': 'la-di-dah'})
    self.assertTrue(new_form.is_valid())
    instance = new_form.update_instance()
    self.assert_(isinstance(instance, TestObject),
                 "update_instance on unbound xmlobjectform returns correct xmlobject instance")
    self.assertEqual(True, instance.bool)
    self.assertEqual('A1', instance.id)
    self.assertEqual('la-di-dah', instance.longtext)
    # spot check values in created-from-scratch xml
    xml = instance.serialize()
    self.assert_('id="A1"' in xml)
    self.assert_('<boolean>yes</boolean>' in xml)

    # formset deletion
    data = self.post_data.copy()
    # update post data to test deleting items
    data.update({
        'children-INITIAL_FORMS': 4,  # only initial forms can be deleted
        'children-0-DELETE': True,
        'children-2-DELETE': True,
    })
    # make a copy object, since the instance will be updated by the form
    testobj = xmlmap.load_xmlobject_from_string(self.testobj.serialize(), TestObject)
    update_form = TestForm(data, instance=self.testobj)
    # check that form is valid - if no errors, this populates cleaned_data
    self.assertTrue(update_form.is_valid())
    instance = update_form.update_instance()
    # children 0 and 2 should be removed from the updated instance
    self.assert_(testobj.children[0] not in instance.children)
    self.assert_(testobj.children[2] not in instance.children)
def generate_tei(self, ocrpage):
    '''Generate TEI facsimile for the current page.

    :param ocrpage: ocr page xmlobject to transform
    :returns: :class:`tei.Facsimile` on success; None when the OCR xml
        is invalid (the error is logged)
    '''
    try:
        result = ocrpage.xsl_transform(filename=self.ocr_to_teifacsimile_xsl,
                                       return_type=unicode, **self.tei_options)
        # xsl_transform returns _XSLTResultTree, which is not JSON
        # serializable; reparse into a tei.Facsimile xmlobject
        return xmlmap.load_xmlobject_from_string(result, tei.Facsimile)
    except etree.XMLSyntaxError:
        # FIX: logger.warn is a deprecated alias for logger.warning
        logger.warning('OCR xml for %s is invalid', self.pid)
def test_isvalid(self):
    """A DC record containing a non-DC element fails validation."""
    self.assertTrue(self.dc.is_valid())
    invalid = """<oai_dc:dc xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/">
        <dc:title>Feet in the Fire</dc:title>
        <not_a_dc_field>bogus</not_a_dc_field>
        </oai_dc:dc>
        """
    bad_record = load_xmlobject_from_string(invalid, DublinCore)
    self.assertFalse(bad_record.is_valid())
def test_pageV1_0(self):
    """Generate TEI facsimile from abbyy ocr for a PageV1_0 object and
    inspect the resulting document."""
    # page 1.0 - abbyy ocr content
    page = PageV1_0(Mock())  # use mock for fedora api, since we won't make any calls
    page.pid = 'rdxtest:4607'
    page.page_order = 5
    with patch('readux.books.models.PageV1_0.volume') as mockvolume:
        mockvolume.uriref = rdflib.URIRef('vol:1')
        mockvolume.display_label = 'Mabel Meredith'
        mockvolume.volume = None
        mockvolume.creator = ['Townley, Arthur']
        mockvolume.date = '1863'
        # update fixture xml with ids
        with open(VolumeV1_0.ocr_add_ids_xsl) as xslfile:
            result = self.fr6v1.xsl_transform(filename=xslfile,
                                              return_type=unicode)
            fr6v1_with_ids = load_xmlobject_from_string(
                result, abbyyocr.Document)
        # use the first page with substantial text content as input
        ocr_page = fr6v1_with_ids.pages[5]
        teipage = page.generate_tei(ocr_page)
        # NOTE: if this fails, call teipage.schema_validation_errors() to debug
        self.assertTrue(teipage.schema_valid(),
                        'generated TEI facsimile should be schema-valid')
        # inspect the tei and check for expected values
        # - page identifier based on page_order value passed in
        self.assertEqual(ocr_page.id, teipage.page.id,
                         'tei id should be carried through from ocr xml')
        self.assertEqual(page.display_label, teipage.title,
                         'tei title should be set from page diplay label')
        # distributor not mapped in teimap, so just use xpath to check
        self.assertEqual(
            settings.TEI_DISTRIBUTOR,
            teipage.node.xpath('string(//t:publicationStmt/t:distributor)',
                               namespaces={'t': teipage.ROOT_NS}),
            'configured tei distributor should be set in publication statement'
        )
        # recognized as abbyy input
        self.assert_('Abbyy file' in teipage.header.source_description,
                     'input should be recognized as Abbyy ocr')
        # brief bibliographic data
        self.assert_(
            mockvolume.display_label in teipage.header.source_description)
        self.assert_(
            mockvolume.creator[0] in teipage.header.source_description)
        self.assert_(mockvolume.date in teipage.header.source_description)
def test_WriteRead(self):
    """IR fields survive a serializeDocument/parse round trip."""
    field_values = [
        ('depositor_name', "Johnny"),
        ('depositor_email', "*****@*****.**"),
        ('date', "2012-05-31"),
        ('filename', "Multiple files"),
        ('collections_date', "2012-05-31"),
        ('collection', '598'),
    ]
    for field, value in field_values:
        setattr(self.ir, field, value)
    serialized = self.ir.serializeDocument(pretty=True)
    reloaded = load_xmlobject_from_string(serialized, IR)
    self.assertEqual(reloaded.collection, '598')
def process_relations(self, pid, relations, options):
    """POST relationship records to the Symplectic API for one pid.

    :param pid: pid the relations belong to (used for log messages)
    :param relations: iterable of relation xmlobjects to upload
    :param options: command options dict; options['noact'] suppresses the POST
    """
    # FIX: corrected typo in log message ("Relationss" -> "Relations")
    self.output(1, "Processing Relations for %s" % pid)
    # post relationship xml
    url = self.relation_create_url
    status = None
    for r in relations:
        self.output(0, "%s %s" % (r.from_object, r.to_object))
        status = None
        valid = r.is_valid()
        self.output(2, "XML valid: %s" % valid)
        if not valid:
            self.output(
                0, "Error because a relation xml is not valid for pid %s %s"
                % (pid, r.validation_errors()))
            self.counts['errors'] += 1
            continue
        if not options['noact']:
            response = self.session.post(self.relation_create_url,
                                         data=r.serialize())
            status = response.status_code
        self.output(2, "POST %s %s" % (url, status if status else "<NO ACT>"))
        self.output(2, r.serialize(pretty=True))
        self.output(
            2,
            "---------------------------------------------------------------------"
        )
        self.output(
            2,
            "====================================================================="
        )
        if status and status not in [200, 201]:
            self.output(
                0, "Error relation POST returned code %s for %s" % (status, pid))
            self.counts['errors'] += 1
            return
        elif not options['noact']:
            # check for warnings returned in the import response
            # FIX: narrowed a bare `except:` to `except Exception` so
            # KeyboardInterrupt/SystemExit are not swallowed; also fixed
            # the "reding" typo in the error message
            try:
                for w in load_xmlobject_from_string(
                        response.raw.read(), OESympImportArticle).warnings:
                    self.output(0, 'Warning: %s %s' % (pid, w.message))
                    self.counts['warnings'] += 1
            except Exception:
                self.output(
                    0,
                    "Trouble reading warnings for relation record in %s" % pid)
        self.counts['relations_processed'] += 1
def test_index_title_parts(self):
    """Title-part fields set on the primary title show up in index_data."""
    record = load_xmlobject_from_string(SAMPLE_MODS, mods.Mods)
    title = record.title_info_list[0]
    title.subtitle = "Primary Subtitle"
    title.part_name = "Primary Part 1"
    title.part_number = "4"
    title.non_sort = "The"
    data = record.index_data()
    expected = [
        ('subtitle', u'Primary Subtitle'),
        ('partnumber', u'4'),
        ('partname', u'Primary Part 1'),
        ('nonsort', u'The'),
    ]
    for key, value in expected:
        self.assertEqual(data[key], value)
def test_relsIsMemberOf(self):
    """An isMemberOf relation round-trips through fox serialization."""
    # build a RELS-EXT with a single isMemberOf relation
    rels = RelsExt()
    rels.about = 'info:fedora/test:124'
    member = MemberOf()
    member.name = 'info:fedora/test:master'
    rels.is_member_of.append(member)
    # attach to the fox object and round-trip
    self.fox.rels_ext = rels
    reparsed = load_xmlobject_from_string(self.fox.serialize(), Fox)
    self.assertTrue(
        'info:fedora/test:master' == reparsed.rels_ext.is_member_of[0].name)
    self.assertTrue(
        '<rel:isMemberOf rdf:resource="info:fedora/test:master"/>'
        in reparsed.serialize())
def add_ocr_ids(self, regenerate_ids=False):
    '''Update OCR xml with ids for pages, blocks, lines, etc.

    :param regenerate_ids: when True, regenerate ids even if present
    :returns: True on success; False when the OCR xml is invalid
    '''
    with open(self.ocr_add_ids_xsl) as xslfile:
        try:
            result = self.ocr.content.xsl_transform(
                filename=xslfile, return_type=unicode,
                id_prefix='rdx_%s.' % self.noid,
                regenerate_ids='true' if regenerate_ids else '')
            # set the result as ocr datastream content
            self.ocr.content = xmlmap.load_xmlobject_from_string(result)
            return True
        except etree.XMLSyntaxError:
            # FIX: logger.warn is a deprecated alias for logger.warning
            logger.warning('OCR xml for %s is invalid', self.pid)
            return False
def test_pageV1_0(self):
    """Generate TEI facsimile from abbyy ocr for a PageV1_0 object and
    inspect the resulting document (duplicate of the variant above)."""
    # page 1.0 - abbyy ocr content
    page = PageV1_0(Mock())  # use mock for fedora api, since we won't make any calls
    page.pid = 'rdxtest:4607'
    page.page_order = 5
    with patch('readux.books.models.PageV1_0.volume') as mockvolume:
        mockvolume.uriref = rdflib.URIRef('vol:1')
        mockvolume.display_label = 'Mabel Meredith'
        mockvolume.volume = None
        mockvolume.creator = ['Townley, Arthur']
        mockvolume.date = '1863'
        # update fixture xml with ids
        with open(VolumeV1_0.ocr_add_ids_xsl) as xslfile:
            result = self.fr6v1.xsl_transform(filename=xslfile,
                                              return_type=unicode)
            fr6v1_with_ids = load_xmlobject_from_string(result, abbyyocr.Document)
        # use the first page with substantial text content as input
        ocr_page = fr6v1_with_ids.pages[5]
        teipage = page.generate_tei(ocr_page)
        # NOTE: if this fails, call teipage.schema_validation_errors() to debug
        self.assertTrue(teipage.schema_valid(),
                        'generated TEI facsimile should be schema-valid')
        # inspect the tei and check for expected values
        # - page identifier based on page_order value passed in
        self.assertEqual(ocr_page.id, teipage.page.id,
                         'tei id should be carried through from ocr xml')
        self.assertEqual(page.display_label, teipage.title,
                         'tei title should be set from page diplay label')
        # distributor not mapped in teimap, so just use xpath to check
        self.assertEqual(settings.TEI_DISTRIBUTOR,
                         teipage.node.xpath('string(//t:publicationStmt/t:distributor)',
                                            namespaces={'t': teipage.ROOT_NS}),
                         'configured tei distributor should be set in publication statement')
        # recognized as abbyy input
        self.assert_('Abbyy file' in teipage.header.source_description,
                     'input should be recognized as Abbyy ocr')
        # brief bibliographic data
        self.assert_(mockvolume.display_label in teipage.header.source_description)
        self.assert_(mockvolume.creator[0] in teipage.header.source_description)
        self.assert_(mockvolume.date in teipage.header.source_description)
def test_multiple_cmodels(self):
    """Both content models appear after a serialize/parse round trip."""
    rels = RelsExt()
    rels.about = 'info:fedora/test:123'
    # attach two content models in order
    for model_name in ('info:fedora/bdr-cmodel:commonMetadata',
                       'info:fedora/bdr-cmodel:masterImage'):
        cmodel = Cmodel()
        cmodel.name = model_name
        rels.model.append(cmodel)
    self.fox.rels_ext = rels
    reparsed = load_xmlobject_from_string(self.fox.serialize(), Fox)
    names = [m.name for m in reparsed.rels_ext.model]
    self.assertTrue('info:fedora/bdr-cmodel:commonMetadata' in names)
    self.assertTrue('info:fedora/bdr-cmodel:masterImage' in names)
def _render_item_to_rdf(self, xmlstring):
    """Render an EAD file component through the item template and parse the
    RDFa output into an :class:`rdflib.Graph` (testing convenience)."""
    # load xml as an ead series item
    component = load_xmlobject_from_string(xmlstring, Series)
    # render with the file_item template used in findingaid display
    self.ctxt.update({'component': component})
    rendered = self.item_tmpl.render(self.ctxt)
    # patch in namespaces so the fragment parses as RDFa
    html = '<html xmlns:schema="%s" xmlns:bibo="%s">%s</html>' % \
        (self.SCHEMA_ORG, self.BIBO, rendered)
    graph = rdflib.Graph()
    graph.parse(data=html, format='rdfa')
    return graph
def test_add_users_and_build_hydra(self):
    """build_hydra maps readers/editors/discoverers to the expected fields."""
    self.builder.addReader('*****@*****.**').addReader('BROWN:GROUP')
    self.builder.addReader('*****@*****.**')
    self.builder.addEditor('*****@*****.**')
    self.builder.addReader('*****@*****.**').addDiscoverer('*****@*****.**')
    serialized = self.builder.build_hydra().serialize(pretty=True)
    hydra_rights = load_xmlobject_from_string(serialized, HydraRights)
    expectations = [
        ('discover_access_group', []),
        ('discover_access_person', ['*****@*****.**']),
        ('read_access_group', ['BROWN:GROUP']),
        ('edit_access_group', []),
        ('edit_access_person', ['*****@*****.**']),
        ('delete_access_group', []),
        ('delete_access_person', []),
    ]
    for attr, expected in expectations:
        self.assertEqual(getattr(hydra_rights, attr), expected)
    self.assertEqual(sorted(hydra_rights.read_access_person),
                     ['*****@*****.**', '*****@*****.**'])
def process_step(self, form):
    """Wizard step hook: on step '0', send the submitted text to the UIMA
    worker over RabbitMQ (RPC reply-queue pattern), block until the reply
    arrives, and transform the TEI result to HTML for later steps."""
    if self.steps.current == '0':
        text_type = form.data['0-text_type']
        text = form.data['0-text']
        # Prepare message
        uima_response = {}
        uima_response['response'] = None
        uima_corr_id = str(uuid.uuid4())
        uima_body = json.dumps({
            'text': text,
            'mode': text_type,
        })

        def uima_on_response(channel, method, props, body):
            # accept only the reply that matches our correlation id
            if uima_corr_id == props.correlation_id:
                uima_response['response'] = body

        # Call UIMA
        uima_connection = BlockingConnection(
            ConnectionParameters(host=RABBITMQ_SERVER))
        uima_channel = uima_connection.channel()
        # exclusive auto-named queue to receive the worker's reply
        uima_result = uima_channel.queue_declare(exclusive=True)
        uima_callback_queue = uima_result.method.queue
        uima_channel.basic_consume(uima_on_response,
                                   no_ack=True,
                                   queue=uima_callback_queue)
        uima_channel.basic_publish(exchange='',
                                   routing_key='uima_plain_worker',
                                   properties=BasicProperties(
                                       reply_to=uima_callback_queue,
                                       content_type='application/json',
                                       correlation_id=uima_corr_id,
                                   ),
                                   body=uima_body)
        # block until the callback above records a matching reply
        while uima_response['response'] is None:
            uima_connection.process_data_events()
        # Transform result into HTML
        result = uima_response['response']
        result = xmlmap.load_xmlobject_from_string(result, xmlclass=RocheTEI)
        result = result.body.xsl_transform(xsl=XSL_TRANSFORM_1).serialize()
        self.uima_result = result
    return self.get_form_step_data(form)
def get_items(self, **kwargs):
    '''Query the DigWF API getItems method.

    If no search terms are specified, getItems returns any items that are
    in the **Ready for Repository** state. Any keyword arguments will be
    passed to getItems as query arguments. Currently supports:

    * control_key (e.g., ocm or ocn number) - may match more than one item
    * item_id - the item id for the record in the DigWF
    * pid - the noid portion of the pid/ARK for the item

    :returns: :class:`Items`
    '''
    endpoint = '%s/getItems' % self.base_url
    response = requests.get(endpoint, params=kwargs)
    if response.status_code == requests.codes.ok:
        # possible response.text ?
        return xmlmap.load_xmlobject_from_string(response.content, Items)
def load_bib_record(kdip):
    """Load MARC XML for an item from the Alma record service.

    Accepts a KDip object or a barcode string.

    :returns: :class:`models.Marc` xmlobject parsed from the response
    """
    if isinstance(kdip, basestring):
        barcode = kdip
    else:
        barcode = kdip.kdip_id
    # FIX: the base url previously ended in '?item_id=', so requests
    # appended the real value as a second parameter, producing
    # '?item_id=&item_id=<barcode>'; pass item_id only via params
    get_bib_rec = requests.get(
        'https://kleene.library.emory.edu/cgi-bin/get_alma_record',
        params={'item_id': barcode})
    return load_xmlobject_from_string(
        get_bib_rec.text.encode('utf-8'), models.Marc)
def process_article(self, pid, symp_pub, options):
    """PUT a publication record to the Symplectic API and report warnings.

    :param pid: pid of the article (used for the url and log messages)
    :param symp_pub: publication xmlobject to serialize and upload
    :param options: command options dict; options['noact'] suppresses the PUT
    """
    self.output(1, "Processing Article %s" % pid)
    # put article xml
    url = '%s/%s' % (self.pub_create_url, pid)
    status = None
    if symp_pub.is_empty():
        # FIX: corrected typo in log message ("becase" -> "because")
        self.output(1, "Skipping because XML is empty")
        self.counts['skipped'] += 1
        return
    valid = symp_pub.is_valid()
    self.output(2, "XML valid: %s" % valid)
    if not valid:
        self.output(
            0, "Error publication xml is not valid for pid %s %s"
            % (pid, symp_pub.validation_errors()))
        self.counts['errors'] += 1
        return
    if not options['noact']:
        response = self.session.put(url, data=symp_pub.serialize())
        status = response.status_code
    self.output(2, "PUT %s %s" % (url, status if status else "<NO ACT>"))
    self.output(
        2,
        "====================================================================="
    )
    self.output(2, symp_pub.serialize(pretty=True).decode('utf-8', 'replace'))
    self.output(
        2,
        "---------------------------------------------------------------------"
    )
    if status and status not in [200, 201]:
        self.output(
            0,
            "Error publication PUT returned code %s for %s" % (status, pid))
        self.counts['errors'] += 1
        return
    elif not options['noact']:
        # check for warnings returned in the import response
        for w in load_xmlobject_from_string(response.raw.read(),
                                            OESympImportArticle).warnings:
            self.output(0, 'Warning: %s %s' % (pid, w.message))
            self.counts['warnings'] += 1
    self.counts['articles_processed'] += 1
def get_all_xpaths(self):
    """Collect every xpath expression referenced by this object: its nodeset,
    variables, action stack frames/datums, field templates (including graph
    templates, which are reparsed from raw xml), and nested details."""
    result = set()
    if self.nodeset:
        result.add(self.nodeset)
    if self.has_variables():
        for variable in self.get_variables():
            result.add(variable.function)
    if self.actions:
        for action in self.actions:
            for frame in action.stack.frames:
                result.add(frame.if_clause)
                # not every frame type carries datums
                for datum in getattr(frame, 'datums', []):
                    result.add(datum.value)

    def _get_graph_config_xpaths(configuration):
        # xpath functions used by each graph configuration entry
        result = set()
        for config in configuration.configs:
            result.add(config.xpath_function)
        return result

    for field in self.fields:
        if field.template.form == 'graph':
            # graph templates are reparsed from their raw xml node
            s = etree.tostring(field.template.node)
            template = load_xmlobject_from_string(s, xmlclass=GraphTemplate)
            result.update(
                _get_graph_config_xpaths(template.graph.configuration))
            for series in template.graph.series:
                result.add(series.nodeset)
                result.update(
                    _get_graph_config_xpaths(series.configuration))
        else:
            result.add(field.header.text.xpath_function)
            result.add(field.template.text.xpath_function)
            if field.template.text.xpath:
                for variable in field.template.text.xpath.variables:
                    if variable.xpath:
                        result.add(str(variable.xpath.function))
    for detail in self.details:
        result.update(detail.get_all_xpaths())
    # optional attributes above may have contributed None entries
    result.discard(None)
    return result
def index(request):
    """Activity index view: the ten most recent UIMA annotations rendered
    to HTML, plus the latest 100 plain annotations."""
    uima_latest = []
    for uima in TextAnnotation.objects.all()[:10]:
        parsed = xmlmap.load_xmlobject_from_string(uima.text.encode("utf-8"),
                                                   xmlclass=RocheTEI)
        html = parsed.body.xsl_transform(xsl=XSL_TRANSFORM_1).serialize()
        # Remove div and p
        uima_latest.append([uima, html])
    annotation_latest = Annotation.objects.all()[:100]
    context = {
        'uima_latest': uima_latest,
        'annotation_latest': annotation_latest,
    }
    return render(request, 'activity/index.html', context)
def get_items(self, **kwargs):
    '''Query the DigWF API getItems method.

    If no search terms are specified, getItems returns any items that are
    in the **Ready for Repository** state. Any keyword arguments will be
    passed to getItems as query arguments. Currently supports:

    * control_key (e.g., ocm or ocn number) - may match more than one item
    * item_id - the item id for the record in the DigWF
    * pid - the noid portion of the pid/ARK for the item

    :returns: :class:`Items`
    '''
    endpoint = '%s/getItems' % self.base_url
    response = requests.get(endpoint, params=kwargs)
    if response.status_code == requests.codes.ok:
        # possible response.text ?
        return xmlmap.load_xmlobject_from_string(response.content, Items)
def UpdateCustom(server, path, purge=False): """ Function to update custom xml datastream for all existing objects. """ i = 0 username, password, root = repo.Get_Configs(server) repo = Repository(root=root, username=username, password=password) xml_files = (x for x in os.listdir(path) if "DATA.xml" in x) for xml in xml_files: pid = repo.Get_Pid(xml, repo) dsx = DatastreamXml(pid, server=server, repo=repo) if pid is not None: print "Object found for {0}".format(xml) custom_ds = CustomEtd(os.path.join(path, xml), server=server, pid=pid) root = custom_ds.CustomDs() custom_xml = os.path.join(path, xml.replace("DATA", "CUSTOM")) #with open(custom_xml, "w") as f: # f.write(etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True)) xml_object = xmlmap.load_xmlobject_from_string( etree.tostring(root, pretty_print=True, encoding='utf-8', xml_declaration=True)) if purge is True: dsx.digital_object.api.purgeDatastream(pid, "CUSTOM") print "PURGED CUSTOM" new_datastream = DatastreamObject( dsx.digital_object, "CUSTOM", "Custom metadata compiled by MSUL", mimetype="text/xml", control_group="X") new_datastream.content = xml_object new_datastream.label = "Custom metadata compiled by MSUL" new_datastream.save()
def show_annotated(request, uima_id):
    """ Show previously annotated UIMA result.

    :param uima_id: primary key of the :class:`TextAnnotation` to display
    """
    try:
        uima = TextAnnotation.objects.get(pk=int(uima_id))
        result = uima.text
    # FIX: was a bare ``except:``, which also swallowed KeyboardInterrupt,
    # SystemExit and real database errors; only catch the two expected
    # failures (missing record, non-numeric id)
    except (TextAnnotation.DoesNotExist, ValueError):
        result = ''

    # TODO: catch XMLSyntaxError (an empty ``result`` will still fail to
    # parse below)

    # XSLT transform result
    q = xmlmap.load_xmlobject_from_string(result.encode("utf-8"),
                                          xmlclass=RocheTEI)
    result = q.body.xsl_transform(xsl=XSL_TRANSFORM_1).serialize()

    # TODO: need an other template...
    data = {'tei_documents': [q],
            'tei_transform': result,
            'max_juan': 0, }
    return render_to_response('browser/text_view.html', data,
                              context_instance=RequestContext(request))
def __init__(self,**kwargs): """ Initializes web OPAC address from passed in variable. """ if kwargs.has_key('opac_url'): self.opac_url = kwargs.get('opac_url') else: self.opac_url = None if kwargs.has_key('item_id'): self.item_id = kwargs.get('item_id') raw_xml_url = self.opac_url + self.item_id try: raw_xml = urllib2.urlopen(raw_xml_url).read() self.item_xml = xmlmap.load_xmlobject_from_string(raw_xml,xmlclass=ItemRecord) except: logging.error("ERROR with %s" % raw_xml_url) self.item_xml = None else: self.item_id = None
def call_api(**kwargs):
    """Call the RoMEO API with the given query arguments and parse the
    xml response.

    Falls back to the configured ``ROMEO_API_KEY`` when the caller did
    not supply an ``ak`` argument; always closes the url handle; returns
    an empty :class:`Response` if parsing never produced one."""
    if 'ak' not in kwargs and hasattr(settings, 'ROMEO_API_KEY'):
        kwargs['ak'] = settings.ROMEO_API_KEY
    url = '%s?%s' % (API_BASE_URL, urlencode(kwargs))

    response = None
    response_file = None
    try:
        response_file = urlopen(url)
        response = xmlmap.load_xmlobject_from_string(
            response_file.read(), xmlclass=Response)
    finally:
        if response_file is not None:
            response_file.close()

    if response is None:
        return Response()  # dummy value to return when things have gone horribly wrong
    return response
def index(request):
    """Render the activity index: the latest ten UIMA annotations
    (each XSL-transformed for display) plus the latest hundred
    annotations."""
    uima_latest = []
    for uima in TextAnnotation.objects.all()[:10]:
        doc = xmlmap.load_xmlobject_from_string(
            uima.text.encode("utf-8"), xmlclass=RocheTEI)
        rendered = doc.body.xsl_transform(xsl=XSL_TRANSFORM_1).serialize()
        # Remove div and p
        uima_latest.append([uima, rendered])

    annotation_latest = Annotation.objects.all()[:100]
    return render(request, 'activity/index.html', {
        'uima_latest': uima_latest,
        'annotation_latest': annotation_latest,
    })
def query(self, xquery=None, start=1, how_many=10, cache=False,
          session=None, release=None, result_type=None):
    """Execute an XQuery query, returning the results directly.

    :param xquery: a string XQuery query
    :param start: first index to return (1-based)
    :param how_many: maximum number of items to return
    :param cache: boolean, to cache a query and return a session id (optional)
    :param session: session id, to retrieve a cached session (optional)
    :param release: session id to be released (optional)
    :rtype: the resultType specified at the creation of this ExistDB;
        defaults to :class:`QueryResult`.
    """
    # xml_s = self.server.query(xquery, how_many, start, kwargs)
    params = {
        '_howmany': how_many,
        '_start': start,
    }
    if xquery is not None:
        params['_query'] = xquery
    if cache:
        params['_cache'] = 'yes'
    if release is not None:
        params['_release'] = release
    if session is not None:
        params['_session'] = session
    if result_type is None:
        result_type = self.resultType

    opts = ' '.join('%s=%s' % (key.lstrip('_'), val)
                    for key, val in params.items() if key != '_query')
    if xquery:
        debug_query = '\n%s' % xquery
    else:
        debug_query = ''
    logger.debug('query %s%s', opts, debug_query)

    # BUG FIX: the timing value previously reused (and clobbered) the
    # ``start`` parameter, so the signal kwargs below reported a unix
    # timestamp as the caller's start index.  Time with its own variable.
    start_time = time.time()
    response = self.session.get(self.restapi_path(''), params=params,
                                stream=False, **self.session_opts)
    if xquery_called is not None:
        args = {
            'xquery': xquery,
            'start': start,  # the 1-based start index, as passed in
            'how_many': how_many,
            'cache': cache,
            'session': session,
            'release': release,
            'result_type': result_type
        }
        xquery_called.send(sender=self.__class__,
                           time_taken=time.time() - start_time,
                           name='query', return_value=response,
                           args=[], kwargs=args)

    if response.status_code == requests.codes.ok:
        # successful release doesn't return any content
        if release is not None:
            return True  # successfully released
        # TODO: test unicode handling
        return xmlmap.load_xmlobject_from_string(response.content,
                                                 result_type)
    # 400 bad request returns an xml error we can parse
    elif response.status_code == requests.codes.bad_request:
        err = xmlmap.load_xmlobject_from_string(response.content,
                                                ExistExceptionResponse)
        raise ExistDBException(err.message)
    # not sure if any information is available on other error codes
    else:
        raise ExistDBException(response.content)
def from_xml(cls, node):
    """Alternate constructor: build an instance of this class from an
    ElementTree node by serializing it and re-parsing as an xmlobject."""
    serialized = ElementTree.tostring(node)
    return load_xmlobject_from_string(serialized, cls)
def handle(self, *args, **options):
    """Sync published Open Emory articles into Symplectic-Elements.

    For each Article (either the pids given as positional args, or every
    object with the Article content model), skip items that are missing,
    untitled, unpublished, or already present in Elements (matched by
    PMC id or by fuzzy title match); otherwise push the article record
    and its relationships to the Elements API.
    """
    self.verbosity = int(
        options['verbosity'])  # 1 = normal, 0 = minimal, 2 = all
    self.v_normal = 1

    #counters
    self.counts = defaultdict(int)

    #connection to repository
    repo = Repository(username=settings.FEDORA_MANAGEMENT_USER,
                      password=settings.FEDORA_MANAGEMENT_PASSWORD)

    #Symplectic-Elements setup
    self.session = requests.Session()
    self.session.auth = (settings.SYMPLECTIC_USER,
                         settings.SYMPLECTIC_PASSWORD)
    # NOTE(review): certificate verification is disabled here — presumably
    # for a self-signed internal endpoint; confirm this is intentional
    self.session.verify = False
    self.session.stream = True
    self.session.headers.update({'Content-Type': 'text/xml'})

    self.pub_query_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL,
                                    "publications")
    self.pub_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL,
                                     "publication/records/manual")
    self.relation_create_url = "%s/%s" % (settings.SYMPLECTIC_BASE_URL,
                                          "relationships")

    #if pids specified, use that list
    try:
        if len(args) != 0:
            pids = list(args)
            pid_set = [repo.get_object(pid=p, type=Article) for p in pids]
        else:
            #search for Articles.
            pid_set = repo.get_objects_with_cmodel(
                Article.ARTICLE_CONTENT_MODEL, Article)
    except Exception as e:
        raise CommandError('Error getting pid list (%s)' % e.message)

    try:
        # paginate to avoid loading the full object list at once
        articles = Paginator(pid_set, 20)
        self.counts['total'] = articles.count
    except Exception as e:
        self.output(0, "Error paginating items: : %s " % (e.message))

    #process all Articles
    for p in articles.page_range:
        try:
            objs = articles.page(p).object_list
        except Exception as e:
            #print error and go to next iteration of loop
            self.output(0, "Error getting page: %s : %s " % (p, e.message))
            self.counts['errors'] += 1
            continue
        for article in objs:
            try:
                if not article.exists:
                    self.output(
                        1, "Skipping %s because pid does not exist"
                        % article.pid)
                    self.counts['skipped'] += 1
                    continue
                title = article.descMetadata.content.title_info.title if (
                    article.descMetadata.content.title_info
                    and article.descMetadata.content.title_info.title
                ) else None
                if title is None or title == '':
                    self.output(
                        1, "Skipping %s because OE Title does not exist"
                        % (article.pid))
                    self.counts['skipped'] += 1
                    continue
                if not article.is_published:
                    self.output(
                        1, "Skipping %s because pid is not published"
                        % article.pid)
                    self.counts['skipped'] += 1
                    continue

                # try to detect article by PMC
                if article.pmcid and not options['force']:
                    response = self.session.get(
                        self.pub_query_url,
                        params={
                            'query': 'external-identifiers.pmc="PMC%s"'
                            % article.pmcid,
                            'detail': 'full'
                        })
                    entries = load_xmlobject_from_string(
                        response.raw.read(), OESympImportArticle).entries
                    self.output(
                        2, "Query for PMC Match: GET %s %s"
                        % (response.url, response.status_code))
                    if response.status_code == 200:
                        if len(entries) >= 1:
                            self.output(
                                1,
                                "Skipping %s because PMC PMC%s already exists"
                                % (article.pid, article.pmcid))
                            self.counts['skipped'] += 1
                            # even when skipping, optionally refresh the
                            # relationship records in Elements
                            if options['rel']:
                                symp_pub, relations = article.as_symp(
                                    source=entries[0].source,
                                    source_id=entries[0].source_id)
                                self.process_relations(
                                    entries[0].source_id, relations,
                                    options)
                                # throttle requests to the Elements API
                                sleep(1)
                            continue
                    else:
                        self.output(
                            1,
                            "Skipping %s because trouble with request %s %s"
                            % (article.pid, response.status_code,
                               entries[0].title))
                        self.counts['skipped'] += 1
                        continue

                # try to detect article by Title if it does not have PMC
                if not options['force']:
                    response = self.session.get(self.pub_query_url,
                                                params={
                                                    'query': 'title~"%s"'
                                                    % title,
                                                    'detail': 'full'
                                                })
                    entries = load_xmlobject_from_string(
                        response.raw.read(), OESympImportArticle).entries
                    # Accouont for mutiple results
                    titles = [e.title for e in entries]
                    self.output(
                        2, "Query for Title Match: GET %s %s"
                        % (response.url, response.status_code))
                    if response.status_code == 200:
                        # fuzzy match: any candidate title at >= 90%
                        # similarity counts as already-present
                        found = False
                        for t in titles:
                            success, percent = percent_match(title, t, 90)
                            self.output(
                                1, "Percent Title Match '%s' '%s' %s "
                                % (title, t, percent))
                            if success:
                                found = True
                        if found:
                            self.output(
                                1,
                                "Skipping %s because Title \"%s\" already exists"
                                % (article.pid, title))
                            self.counts['skipped'] += 1
                            # update relations if rel is set
                            if options['rel']:
                                symp_pub, relations = article.as_symp(
                                    source=entries[0].source,
                                    source_id=entries[0].source_id)
                                self.process_relations(
                                    entries[0].source_id, relations,
                                    options)
                                sleep(1)
                            continue
                    else:
                        self.output(
                            1,
                            "Skipping %s because trouble with request %s %s"
                            % (article.pid, response.status_code,
                               entries[0].title))
                        self.counts['skipped'] += 1
                        continue

                # Process article and relations
                symp_pub, relations = article.as_symp()
                self.process_article(article.pid, symp_pub, options)
                self.process_relations(article.pid, relations, options)
                sleep(1)
            except Exception as e:
                self.output(
                    0, "Error processing pid: %s : %s "
                    % (article.pid, e.message))
                import traceback
                traceback.print_exc()
                self.counts['errors'] += 1

    # summarize what was done
    self.stdout.write("\n\n")
    self.stdout.write("Total number selected: %s\n" % self.counts['total'])
    self.stdout.write("Skipped: %s\n" % self.counts['skipped'])
    self.stdout.write("Errors: %s\n" % self.counts['errors'])
    self.stdout.write("Warnings: %s\n" % self.counts['warnings'])
    self.stdout.write("Articles Processed: %s\n"
                      % self.counts['articles_processed'])
    self.stdout.write("Relations Processed: %s\n"
                      % self.counts['relations_processed'])
def setUp(self):
    """Parse the MODS fixture into an xmlobject before each test."""
    super(TestMods, self).setUp()
    fixture_xml = self.FIXTURE
    self.mods = load_xmlobject_from_string(fixture_xml, mods.MODS)
def from_string(cls, xml_string):
    """Alternate constructor: parse *xml_string* into a Report instance."""
    report = xmlmap.load_xmlobject_from_string(xml_string, xmlclass=cls)
    return report
def from_xml(cls, node):
    """Alternate constructor: serialize an ElementTree node (utf-8) and
    re-parse it as an instance of this xmlobject class."""
    xml_bytes = ElementTree.tostring(node, encoding='utf-8')
    return load_xmlobject_from_string(xml_bytes, cls)
def annotation_to_tei(annotation, teivol): '''Generate a tei note from an annotation. Sets annotation id, slugified tags as ana attribute, username as resp attribute, and annotation content is converted from markdown to TEI. :param annotation: :class:`~readux.annotations.models.Annotation` :param teivol: :class:`~readux.books.tei.AnnotatedFacsimile` tei document, for converting related page ARK uris into TEI ids :returns: :class:`readux.books.tei.Note` ''' # NOTE: annotation created/edited dates are not included here # because they were determined not to be relevant for our purposes # sample note provided by Alice # <note resp="JPK" xml:id="oshnp50n1" n="1"><p>This is an example note.</p></note> # convert markdown-formatted text content to tei note_content = markdown_tei.convert(annotation.text) # markdown results could be a list of paragraphs, and not a proper # xml tree; also, pags do not include namespace # wrap in a note element and set the default namespace as tei teinote = load_xmlobject_from_string('<note xmlns="%s">%s</note>' % \ (teimap.TEI_NAMESPACE, note_content), tei.Note) # what id do we want? annotation uuid? url? 
teinote.id = 'annotation-%s' % annotation.id # can't start with numeric teinote.href = absolutize_url(annotation.get_absolute_url()) teinote.type = 'annotation' # if an annotation includes tags, reference them by slugified id in @ana if 'tags' in annotation.info() and annotation.info()['tags']: tags = ' '.join( set('#%s' % slugify(t.strip()) for t in annotation.info()['tags'])) teinote.ana = tags # if the annotation has an associated user, mark the author # as responsible for the note if annotation.user: teinote.resp = annotation.user.username # include full markdown of the annotation, as a backup for losing # content converting from markdown to tei, and for easy display teinote.markdown = annotation.text # if annotation contains related pages, generate a link group if annotation.related_pages: for rel_page in annotation.related_pages: page_ref = tei.Ref(text=rel_page, type='related page') # find tei page identifier from the page ark target = teivol.page_id_by_xlink(rel_page) if target is not None: page_ref.target = '#%s' % target teinote.related_pages.append(page_ref) # if annotation includes citations, add them to the tei # NOTE: expects these citations to be TEI encoded already (generated # by the zotero api and added via meltdown-zotero annotator plugin) if annotation.extra_data.get('citations', None): for bibl in annotation.extra_data['citations']: # zotero tei export currently includes an id that is not # a valid ncname (contains : and /) bibsoup = BeautifulSoup(bibl, 'xml') # convert xml id into the format we want: # zotero-#### (zotero item id) for bibl_struct in bibsoup.find_all('biblStruct'): bibl_struct['xml:id'] = 'zotero-%s' % \ bibl_struct['xml:id'].split('/')[-1] teibibl = load_xmlobject_from_string(bibsoup.biblStruct.prettify(), tei.BiblStruct) teinote.citations.append(teibibl) return teinote
def from_pid(cls, pid):
    """Alternate constructor: fetch the annotation MODS xml for *pid*
    and parse it; raises on any non-OK HTTP response."""
    response = requests.get(annotation_xml_url(pid))
    if not response.ok:
        message = 'error retrieving annotation data for %s: %s - %s' % (
            pid, response.status_code, response.content)
        raise Exception(message)
    parsed_mods = load_xmlobject_from_string(response.content, mods.Mods)
    return cls(pid=pid, mods_obj=parsed_mods)
def test_SubordinateComponents_noseries(self):
    """A dsc holding only a container list (no series) reports no series."""
    # simple finding aid with no series but only a container list
    container_only = """<dsc><c01 level="file"/></dsc>"""
    parsed = load_xmlobject_from_string(container_only,
                                        eadmap.SubordinateComponents)
    self.assertFalse(parsed.hasSeries())
def parse(cls, xml):
    """Run the parser on an XML string, returning an instance of this class."""
    return load_xmlobject_from_string(xml, cls)
def handle(self, *paths, **options):
    """Import a single BagIt bag as a Book/Volume/Pages into the repository.

    Validates the bag, locates the required PDF, MARC xml, and DC xml
    payload files (with MD5 checksums), creates or reuses a Book keyed on
    the MARC ocm number, ingests a Volume with the PDF, then ingests one
    Page object per jp2/jpf image (with OCR xml when present) and sets the
    first page as the volume's primary image.  ``--dry-run`` skips all
    repository writes; ``--collection`` is required.
    """
    if not len(paths):
        raise CommandError('Please specify path to content for import.')
    if len(paths) > 1:
        # this limitation is kind of arbitrary, but keep thing simple for now
        raise CommandError(
            'Import currently only supports a single volume.')
    path = paths[0]

    dry_run = options.get('dry_run', False)
    verbosity = options.get('verbosity', self.v_normal)

    repo = ManagementRepository()

    # make collection required to avoid accidentally forgetting it
    coll = options.get('collection', None)
    if coll is None:
        raise CommandError('Please specify collection pid')
    collection = repo.get_object(coll, type=Collection)
    if not collection.exists:
        raise CommandError('Collection %s does not exist' % coll)
    if not collection.has_requisite_content_models:
        raise CommandError('%s is not a collection' % coll)

    try:
        start = time.time()
        bag = bagit.Bag(path)
        # NOTE: could consider using fast validation, but files probably are
        # not so large or so numerous that this will be an issue
        if verbosity > self.v_normal:
            self.stdout.write('Validating bag %s' % path)
        fast_validate = options.get('fast_validate')
        bag.validate(fast=fast_validate)
        if verbosity >= self.v_normal:
            self.stdout.write(
                'Validated %s in %.02fs %s' %
                (path, time.time() - start, '(fast validation enabled)'
                 if fast_validate else ''))
    except bagit.BagError as err:
        # failed to load directory as a bag
        raise CommandError('Please supply a valid BagIt as input. %s' % err)
    # NOTE(review): if BagValidationError subclasses BagError in this
    # version of bagit, this handler is unreachable — confirm ordering
    except bagit.BagValidationError as err:
        # bag is not valid
        raise CommandError('Input is not a valid bag. %s' % err)

    files = {'pdf': None, 'marcxml': None, 'dc': None}
    checksums = {}

    # this is potentially a long list, but go ahead and store since we will
    # be consulting it multiple times
    payload_files = list(bag.payload_files())

    # identify required contents within the bag by extension and name
    for data_path in payload_files:
        # path is relative to bag root dir
        filename = os.path.join(path, data_path)

        # get extension and name
        basename = os.path.basename(filename)
        basefile, ext = os.path.splitext(basename)
        # NOTE: splitext leaves . on the ext portion

        if ext.lower() == '.pdf':
            files['pdf'] = filename
            checksums['pdf'] = bag.entries[data_path].get('md5', None)

        elif ext.lower() == '.xml':
            if basefile.lower() == 'marc':
                files['marcxml'] = filename
                checksums['marcxml'] = bag.entries[data_path].get(
                    'md5', None)
            elif basefile.lower() == 'dc':
                files['dc'] = filename
                checksums['dc'] = bag.entries[data_path].get('md5', None)

    # check that required components are present
    err = False
    for label, filepath in files.iteritems():
        if filepath is None:
            self.stderr.write('%s not found' % label.upper())
            err = True
        elif checksums[label] is None:
            self.stderr.write('No MD5 checksum found for %s' % label.upper())
            err = True
    if err:
        raise CommandError(
            'Cannot import without all required files and checksums.')

    # all pieces are available, so proceed with ingest

    # construct book and ingest
    if verbosity > self.v_normal:
        self.stdout.write('Creating book object with marxml %s'
                          % files['marcxml'])
    try:
        marcxml = load_xmlobject_from_file(files['marcxml'], MinMarcxml)
    except XMLSyntaxError as err:
        raise CommandError('Failed to load %s as xml: %s'
                           % (files['marcxml'], err))
    try:
        dcxml = load_xmlobject_from_file(files['dc'], DublinCore)
    except XMLSyntaxError as err:
        raise CommandError('Failed to load %s as xml: %s'
                           % (files['dc'], err))

    # look for book by ocm number first, in case a previous ingest failed
    book_pids = Book.pids_by_label(marcxml.ocm_number)
    # error if we find more than one
    if len(book_pids) > 1:
        raise CommandError('Multiple books exist with label %s. Please correct this first.' \
            % marcxml.ocm_number)
    # if we find exactly one, use that instead of creating a new book
    elif len(book_pids) == 1:
        book = repo.get_object(book_pids[0], type=Book)
        if verbosity >= self.v_normal:
            self.stdout.write('Using existing book %s with ocm number %s' % \
                (book.pid, marcxml.ocm_number))

    # otherwise, ingest new book
    else:
        book = repo.get_object(type=Book)
        # set book label to ocm number from the marc
        book.label = marcxml.ocm_number
        if verbosity > self.v_normal:
            self.stdout.write('Book label %s' % book.label)

        # associate with collection
        if collection is not None:
            book.collection = collection
            if verbosity > self.v_normal:
                self.stdout.write('Associating with collection %s'
                                  % collection.short_label)
        book.marcxml.content = marcxml
        # NOTE: import checksum can't be used because xml may be serialized differently
        # book.marcxml.checksum = checksums['marcxml']
        book.dc.content = dcxml
        # NOTE: import checksum can't be used because DC is modified to add ARK
        # book.dc.checksum = checksums['dc']

        # save; bail if error
        if not dry_run:
            try:
                saved = book.save('ingest')
                if not saved:
                    raise CommandError(
                        'Failed to ingest book into repository')
                if verbosity >= self.v_normal:
                    self.stdout.write('Successfully ingested book %s' \
                        % book.pid)
            except RequestFailed as err:
                raise CommandError('Error ingesting book: %s' % err)

    # in case of pre-existing book object, check for existing volume
    if book.volume_set:
        if len(book.volume_set) > 1:
            raise CommandError('Book %s has multiple volumes; import not supported' \
                % book.pid)
        else:
            # use existing volume object
            vol = book.volume_set[0]
            if verbosity >= self.v_normal:
                self.stdout.write('Using existing volume %s' % vol.pid)

    # otherwise, create new volume object
    else:
        # construct volume (v1.1), associate with book, and ingest
        if verbosity > self.v_normal:
            self.stdout.write('Creating volume with %s' % files['pdf'])
        with open(files['pdf']) as pdf_file:
            vol = repo.get_object(type=VolumeV1_1)
            # set volume label to ocm number from the marc + volume number
            # for consistency with lsdi content, use ocm_v# notation
            # V.0 indicates single-volume book
            vol.label = '%s_V.0' % marcxml.ocm_number
            # set pdf content
            vol.pdf.content = pdf_file
            vol.pdf.checksum = checksums['pdf']
            # set relation to parent book object
            vol.book = book
            # minimal DC metadata derived from book metadata
            vol.dc.content.title = book.dc.content.title
            for t in book.dc.content.type_list:
                vol.dc.content.type_list.append(t)
            vol.dc.content.format = book.dc.content.format
            vol.dc.content.language = book.dc.content.language
            vol.dc.content.rights = book.dc.content.rights
            if not dry_run:
                try:
                    saved = vol.save('ingest')
                    if not saved:
                        # NOTE: possibly, if this fails, we should deactivate the book object
                        # but will leave that to manual processing for now
                        raise CommandError(
                            'Failed to ingest volume into repository')
                    else:
                        if verbosity >= self.v_normal:
                            self.stdout.write('Successfully ingested volume %s' \
                                % vol.pid)
                except RequestFailed as err:
                    raise CommandError('Error ingesting volume: %s' % err)

    #### page import

    # if volume has existing pages, bail
    if len(vol.pages):
        raise CommandError('Volume %s already has %s page%s' % \
            (vol.pid, len(vol.pages), '' if len(vol.pages) == 1 else 's'))

    # should page import happen here?
    # - identify numeric jp2/jpf files in the bag and get total count
    # - identify numeric .xml files in the bag and get total count
    # - make sure counts match up
    # Question: can we assume no start/end blank pages for now?
    # - start looping through, create page-1.1 and associate with book,
    #   and ingest
    # - set first page as primary image on the volume
    # - report number of pages ingested

    image_files = []
    # identify page files (images and ocr xml)
    for data_path in payload_files:
        # get extension and name
        basename = os.path.basename(data_path)
        basefile, ext = os.path.splitext(basename)
        if ext in ['.jp2', '.jpf']:
            image_files.append(data_path)
            # check that MD5 is present and bail if not
            # - this is probably redundant since by this point validation
            #   has passed and previous content has checksums, but
            #   ingest will assume checksums are available so better to error
            #   *before* starting to ingest page-level content
            if bag.entries[data_path].get('md5', None) is None:
                raise CommandError('No MD5 checksum for %s' % data_path)

    # ensure pages are sorted into page-order
    image_files.sort()

    # NOTE: disabled for now; tunebook does not appear to include alto
    # for pages with no text content
    ## find matching page ocr files
    # for imgfile in image_files:
    #     basefile, ext = os.path.splitext(imgfile)
    #     ocrfile = '%s.xml' % basefile
    #     if ocrfile not in payload_files:
    #         raise CommandError('No OCR xml page present for %s (expected %s)' % \
    #             (imgfile, ocrfile))

    # pre-generate empty xml in case we need it to force eulfedora to not
    # create ocr datastream when no ocr is present
    emptyxml = load_xmlobject_from_string('<empty/>')

    # iterate through page images and put into fedora
    pageindex = 1
    for imgfile in image_files:
        if verbosity > self.v_normal:
            print 'Creating Page object for %s' % imgfile
        # path is relative to bag root dir
        img_filename = os.path.join(path, imgfile)
        page = repo.get_object(type=PageV1_1)
        # set page label
        page.label = '%s page %d' % (vol.label, pageindex)
        # set the relation to the volume object
        page.volume = vol
        logger.debug('Page %s volume %s' % (page.pid, page.volume.pid))
        # set a dc:title based on volume title
        page.dc.content.title = '%s page %d' % (vol.dc.content.title,
                                                pageindex)
        # set page order
        page.page_order = pageindex
        with open(img_filename) as img_content:
            # set image content
            page.image.content = img_content
            page.image.checksum = bag.entries[imgfile]['md5']
            # assume jpeg2000 for now (only looking for jp2/jpf)
            page.image.mimetype = 'image/jp2'

            # check for ocr xml within the bag, same base name as image
            basefile, ext = os.path.splitext(imgfile)
            ocrfile = '%s.xml' % basefile
            if ocrfile in payload_files:
                page.ocr.content = load_xmlobject_from_file(
                    os.path.join(path, ocrfile))
                # NOTE: can't use MD5 from bag because XML may be
                # serialized differently when sent to Fedora
                # (unless we treat as file instead of xml...)
                # page.ocr.checksum = bag.entries[ocrfile]['md5']
                if verbosity > self.v_normal:
                    print 'Setting OCR for Page from %s' % ocrfile
            else:
                # warn but do not error if ocr xml is not found
                self.stdout.write('Warning: no OCR xml found for %s'
                                  % imgfile)
                # explicitly set xml content to empty so eulfedora doesn't
                # attempt to bootstrap & ingest (and error)
                page.ocr.content = emptyxml

            if not dry_run:
                try:
                    # for now, if any page ingest errors, bail out
                    # (unclear what would cause load to fail midway)
                    saved = page.save()
                    if not saved:
                        raise CommandError('Failed to ingest page %d into repository' \
                            % pageindex)
                except RequestFailed as err:
                    raise CommandError('Error ingesting page %d: %s'
                                       % (pageindex, err))

        # set first page as primary image for the volume
        if not dry_run and pageindex == 1:
            vol.primary_image = page
            vol.save('adding primary image relation')

        # increase page index for next page
        pageindex += 1

    if verbosity >= self.v_normal:
        # total is pageindex - 1 since pageindex incremented at end of loop
        self.stdout.write('Created %d pages' % (pageindex - 1))
def test_update_instance(self):
    """Exercise XmlObjectForm.update_instance: bound update of an existing
    object, creation from an unbound form, and formset deletion."""
    # initialize data the same way a view processing a POST would
    update_form = TestForm(self.post_data, instance=self.testobj)
    # check that form is valid - if no errors, this populates cleaned_data
    self.assertTrue(update_form.is_valid())
    instance = update_form.update_instance()
    self.assert_(isinstance(instance, TestObject))
    self.assertEqual(21, instance.int)
    self.assertEqual(False, instance.bool)
    self.assertEqual('b', instance.id)
    self.assertEqual('completely new text content', instance.longtext)
    self.assertEqual(0, instance.other_child.val)

    # spot check that values were set properly in the xml
    xml = instance.serialize()
    self.assert_('id="b"' in xml)
    self.assert_('<boolean>no</boolean>' in xml)

    # test save on form with no pre-existing xmlobject instance
    class SimpleForm(XmlObjectForm):
        class Meta:
            model = TestObject
            fields = ['id', 'bool', 'longtext']  # fields with simple, top-level xpaths
            # creation for nested node not yet supported in xmlmap - excluding int
            exclude = ['child']  # exclude subform to simplify testing

    new_form = SimpleForm({
        'id': 'A1',
        'bool': True,
        'longtext': 'la-di-dah'
    })
    self.assertTrue(new_form.is_valid())
    instance = new_form.update_instance()
    self.assert_(
        isinstance(instance, TestObject),
        "update_instance on unbound xmlobjectform returns correct xmlobject instance"
    )
    self.assertEqual(True, instance.bool)
    self.assertEqual('A1', instance.id)
    self.assertEqual('la-di-dah', instance.longtext)
    # spot check values in created-from-scratch xml
    xml = instance.serialize()
    self.assert_('id="A1"' in xml)
    self.assert_('<boolean>yes</boolean>' in xml)

    # formset deletion
    data = self.post_data.copy()
    # update post data to test deleting items
    data.update({
        'children-INITIAL_FORMS': 4,  # only initial forms can be deleted
        'children-0-DELETE': True,
        'children-2-DELETE': True,
    })
    # make a copy object, since the instance will be updated by the form
    testobj = xmlmap.load_xmlobject_from_string(self.testobj.serialize(),
                                                TestObject)
    update_form = TestForm(data, instance=self.testobj)
    # check that form is valid - if no errors, this populates cleaned_data
    self.assertTrue(update_form.is_valid())
    instance = update_form.update_instance()
    # children 0 and 2 should be removed from the updated instance
    self.assert_(testobj.children[0] not in instance.children)
    self.assert_(testobj.children[2] not in instance.children)
def setUp(self):
    """Parse the Dublin Core fixture into an xmlobject before each test."""
    fixture_xml = self.FIXTURE
    self.dc = load_xmlobject_from_string(fixture_xml, DublinCore)
def setUp(self):
    """Prepare one unbound form and one form bound to a fixture-backed
    TestObject instance."""
    # parse the fixture first; it backs the bound form below
    self.testobj = xmlmap.load_xmlobject_from_string(FIXTURE_TEXT,
                                                     TestObject)
    # instance of form with no test object
    self.new_form = TestForm()
    # instance of form with test object instance
    self.update_form = TestForm(instance=self.testobj)