def _oai2d_endpoint_identifiers(self):
    """Collect the record ids the OAI endpoint lists for the community set.

    Issues a ``ListIdentifiers`` request (``oai_dc`` prefix) for
    ``self.community.oaiset_spec`` while ``sickle.app.requests.get`` is
    replaced by ``sickle_requests_get_mock()``.

    Returns:
        A set of ints, each parsed from the text following the last ``:``
        of a header identifier.
    """
    recids = set()
    with patch('sickle.app.requests.get', new=sickle_requests_get_mock()):
        harvester = Sickle('http://auditor/oai2d')
        headers = harvester.ListIdentifiers(
            set=self.community.oaiset_spec,
            metadataPrefix='oai_dc',
        )
        # Iterate inside the ``with`` so that any request made while walking
        # the headers still goes through the patched getter.
        for header in headers:
            _, _, tail = header.identifier.rpartition(':')
            recids.add(int(tail))
    return recids
def test_list_oai_collections(self, community):
    """Build a name-sorted list of the collections "owned" by a community.

    Collections are a secondary grouping concept in OAI. Two OAI-PMH verbs
    are used: ``ListIdentifiers`` to find which collection setSpecs occur on
    the community's records, and ``ListSets`` to look up their names.

    Args:
        community: object providing ``repository.base_url`` (the OAI
            endpoint) and ``identifier`` (the community's setSpec).

    Returns:
        A list of ``(setSpec, setName)`` tuples sorted by ``setName``. The
        same list is stored on ``self.collections``. A collection that has
        no matching ``ListSets`` entry keeps ``None`` as its name and sorts
        before all named collections.
    """
    sickle = Sickle(community.repository.base_url)

    # Retrieve the record headers associated with the community.
    record_headers = sickle.ListIdentifiers(
        metadataPrefix='oai_dc', set=community.identifier)

    # Register every collection setSpec seen on those headers.
    community_collections = {}
    for header in record_headers:
        for set_spec in header.setSpecs:
            if set_spec.startswith('col'):
                community_collections[set_spec] = None

    # Map names onto the registered ids: {setSpec: setName}.
    for oai_set in sickle.ListSets():
        # Bug in oai? In the ListSets results a 'collection' carries a 'com'
        # prefix, so the first three characters are swapped for 'col'.
        modstr = 'col' + oai_set.setSpec[3:]
        if modstr in community_collections:
            community_collections[modstr] = oai_set.setName

    # Sort by name. The leading boolean keeps unnamed (None) entries first
    # and avoids comparing None with str, which raises on Python 3.
    self.collections = sorted(
        community_collections.items(),
        key=lambda item: (item[1] is not None, item[1] or ''),
    )
    return self.collections
def listIdentifiers(request):
    """Run an OAI-PMH ListIdentifiers request against a remote repository.

    POST http://localhost/oai_pmh/api/listidentifiers
    POST data query='{"url":"value", "metadataprefix":"value"}'
    optional {"set":"value"}

    Args:
        request: the incoming request; ``request.DATA`` supplies ``url``,
            ``metadataprefix`` and, optionally, ``set``.

    Returns:
        A 200 ``Response`` carrying the serialized headers on success.
        An ``OAIAPIException`` raised along the way is answered with its own
        ``response()``; any other exception produces a 500 ``Response`` with
        a labelled error message.
    """
    try:
        serializer = RegistryURLSerializer(data=request.DATA)
        if not serializer.is_valid():
            raise OAIAPISerializeLabelledException(
                errors=serializer.errors,
                status=status.HTTP_400_BAD_REQUEST)

        url = request.DATA['url']
        metadataprefix = request.DATA['metadataprefix']
        setH = request.DATA.get('set', None)

        sickle = Sickle(url)
        rsp = sickle.ListIdentifiers(metadataPrefix=metadataprefix, set=setH)
        # Plain iteration instead of calling ``rsp.next()`` by hand: the
        # ``next`` method only exists on Python 2 style iterators.
        rtn = [dict(header) for header in rsp]

        serializer = ListIdentifierSerializer(rtn)
        return Response(serializer.data, status=status.HTTP_200_OK)
    except OAIAPIException as e:
        return e.response()
    except Exception as e:
        # ``str(e)`` rather than ``e.message``: the attribute is gone on
        # Python 3 and reading it here would raise from inside the handler.
        content = APIMessage.getMessageLabelled(
            'An error occurred when attempting to identify resource: %s'
            % str(e))
        return Response(content, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
def generate(self):
    """Build one ResourceSync resource per OAI-PMH record header.

    Each resource stands for one full OAI-PMH record (i.e., the result of a
    GetRecord request). Headers come from a ``ListIdentifiers`` request made
    with ``ignore_deleted=True`` and with the base URL, set and metadata
    prefix taken from ``self.params``.

    Returns:
        A list holding the result of
        ``self.oaipmh_header_to_resourcesync_resource`` for every header.
    """
    harvester = Sickle(self.params['oaipmh_base_url'])
    oai_set = self.params['oaipmh_set']
    prefix = self.params['oaipmh_metadataprefix']
    header_iter = harvester.ListIdentifiers(
        ignore_deleted=True,
        set=oai_set,
        metadataPrefix=prefix,
    )
    to_resource = self.oaipmh_header_to_resourcesync_resource
    return [to_resource(header) for header in header_iter]
def harvest(host, from_date, until, format, out, set, verbose):
    """Harvest OAI-PMH records from ``host`` and write them to ``out``.

    The identifiers matching the filters are listed first; every record is
    then fetched with ``GetRecord`` and its raw XML is written, UTF-8
    encoded, between a ``<records>`` and a ``</records>`` marker.

    Args:
        host: base URL handed to ``Sickle``.
        from_date: value sent as the ``from`` request argument.
        until: value sent as the ``until`` request argument.
        format: metadata prefix used for both listing and fetching.
        out: path of the output file (opened in binary write mode).
        set: optional set restriction; omitted from the request when None.
        verbose: when truthy, logging is configured at DEBUG, else INFO.

    The process exits through ``sys.exit()`` when the listing request raises
    ``NoRecordsMatch``.
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    logging.info("OAI-PMH harvesting from %s", host)
    logging.info("From date = %s", from_date)
    logging.info("Until date = %s", until)
    logging.info("Metadata format = %s", format)
    logging.info("Outfile = %s", out)

    mysickle = Sickle(host, iterator=OAIItemIterator)

    query = {'metadataPrefix': format, 'from': from_date, 'until': until}
    if set is not None:
        query['set'] = set

    try:
        headers = mysickle.ListIdentifiers(**query)
    except NoRecordsMatch:
        logging.info("No records harvested: the combination of the values of "
                     "the arguments results in an empty list.")
        sys.exit()

    # Collect every identifier up front so the count can be reported before
    # any record is fetched.
    identifier_list = [header.identifier for header in headers]
    logging.info(f"Identifier count to harvest: {len(identifier_list)}")

    with open(out, 'wb') as f:
        f.write('<records>'.encode())
        for position, identifier in enumerate(identifier_list):
            record = mysickle.GetRecord(identifier=identifier,
                                        metadataPrefix=format)
            f.write(record.raw.encode('utf8'))
            # Zero-based index of the record just written.
            logging.debug(position)
            logging.debug(record.raw)
        f.write('</records>'.encode())

    logging.info("Total records harvested: %i", len(identifier_list))
class TestCase(unittest.TestCase):
    """Exercise the Sickle OAI-PMH verbs with ``Sickle.harvest`` patched.

    ``sickle.app.Sickle.harvest`` is replaced by ``mock_harvest`` around
    every test, so the expected counts below describe whatever data
    ``mock_harvest`` serves.
    """

    def __init__(self, methodName='runTest'):
        super(TestCase, self).__init__(methodName)
        # Built once per test instance; started and stopped around each test.
        self.patch = mock.patch('sickle.app.Sickle.harvest', mock_harvest)

    def setUp(self):
        self.patch.start()
        self.sickle = Sickle('http://localhost')

    def tearDown(self):
        self.patch.stop()

    def test_OAIResponse(self):
        response = self.sickle.harvest(verb='ListRecords',
                                       metadataPrefix='oai_dc')
        self.assertIsInstance(response.xml, etree._Element)
        self.assertIsInstance(response.raw, string_types)

    def test_broken_XML(self):
        response = self.sickle.harvest(
            verb='ListRecords', resumptionToken='ListRecordsBroken.xml')
        self.assertEqual(response.xml, None)
        self.assertIsInstance(response.raw, string_types)

    def test_ListRecords(self):
        record_iter = self.sickle.ListRecords(metadataPrefix='oai_dc')
        self.assertEqual(len(list(record_iter)), 8)

    def test_ListRecords_ignore_deleted(self):
        record_iter = self.sickle.ListRecords(metadataPrefix='oai_dc',
                                              ignore_deleted=True)
        self.assertEqual(len(list(record_iter)), 4)

    def test_ListSets(self):
        sets = list(self.sickle.ListSets())
        self.assertEqual(131, len(sets))
        # Converting to a dict must not raise.
        dict(sets[0])

    def test_ListMetadataFormats(self):
        mdfs = list(self.sickle.ListMetadataFormats())
        self.assertEqual(5, len(mdfs))
        # Converting to a dict must not raise.
        dict(mdfs[0])

    def test_ListIdentifiers(self):
        header_iter = self.sickle.ListIdentifiers(metadataPrefix='oai_dc')
        self.assertEqual(len(list(header_iter)), 4)

    def test_ListIdentifiers_ignore_deleted(self):
        header_iter = self.sickle.ListIdentifiers(metadataPrefix='oai_dc',
                                                  ignore_deleted=True)
        # There are 2 deleted headers in the test data
        self.assertEqual(len(list(header_iter)), 2)

    def test_Identify(self):
        identify = self.sickle.Identify()
        expected_attributes = (
            'repositoryName',
            'baseURL',
            'adminEmail',
            'earliestDatestamp',
            'deletedRecord',
            'granularity',
            'description',
            'oai_identifier',
            'sampleIdentifier',
        )
        for attribute in expected_attributes:
            self.assertTrue(hasattr(identify, attribute))
        dict(identify)

    def test_GetRecord(self):
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        header = record.header
        self.assertEqual(header.identifier, oai_id)
        self.assertIn(oai_id, record.raw)
        self.assertEqual(header.datestamp, '2011-09-05T12:51:52Z')
        self.assertIsInstance(record.xml, etree._Element)
        # The conversions below only need to complete without raising.
        binary_type(record)
        text_type(record)
        dict(record.header)
        self.assertEqual(dict(record), record.metadata)

    # Test OAI-specific exceptions

    @raises(BadArgument)
    def test_badArgument(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badArgument')

    @raises(CannotDisseminateFormat)
    def test_cannotDisseminateFormat(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='cannotDisseminateFormat')

    @raises(IdDoesNotExist)
    def test_idDoesNotExist(self):
        self.sickle.GetRecord(metadataPrefix='oai_dc',
                              error='idDoesNotExist')

    @raises(NoSetHierarchy)
    def test_noSetHierarchy(self):
        self.sickle.ListSets(metadataPrefix='oai_dc',
                             error='noSetHierarchy')

    @raises(BadResumptionToken)
    def test_badResumptionToken(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badResumptionToken')

    @raises(NoRecordsMatch)
    def test_noRecordsMatch(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='noRecordsMatch')

    @raises(OAIError)
    def test_undefined_OAI_error_XML(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='undefinedError')

    def test_OAIResponseIterator(self):
        response_client = Sickle('fake_url', iterator=OAIResponseIterator)
        responses = list(response_client.ListRecords(metadataPrefix='oai_dc'))
        self.assertEqual(len(responses), 4)
def get_identifiers_in_set(setSpec):
    """Request the ``oai_dc`` record headers belonging to one OAI set.

    Args:
        setSpec: value passed as the ``set`` argument of ``ListIdentifiers``.

    Returns:
        Whatever ``Sickle.ListIdentifiers`` returns for the repository whose
        URL comes from ``admin.get_repository_url()``.
    """
    repository = Sickle(admin.get_repository_url())
    return repository.ListIdentifiers(metadataPrefix='oai_dc', set=setSpec)
from oaiharvests.utils import * from oaiharvests.models import * from sickle import Sickle # com = Community.objects.all()[0] # oai = OAIUtils() # oai.list_oai_collections(com) base_url = 'http://scholarspace.manoa.hawaii.edu/dspace-oai/request' llt_id = 'com_10125_27123' s = Sickle(base_url) record_headers = list(s.ListIdentifiers(metadataPrefix='oai_dc', set=llt_id)) community_collections = {} for i in record_headers: # Iterate over associated sets looking for collections for j in i.setSpecs: if j[:3] == 'col': community_collections[j] = None # register id in map for i in s.ListSets(): try: print community_collections[i.setSpec] community_collections[i.setSpec] = i.setName print i.setSpec, '==>', community_collections[i.setSpec] print i except KeyError as e:
class OAIHarvester(Harvester):
    """Harvester that reads records from an OAI-PMH endpoint via Sickle."""

    def __init__(self, community, url, oai_metadata_prefix, oai_set,
                 fromdate, clean, limit, outdir, verify):
        """Set up the base harvester, then create the Sickle client.

        ``oai_metadata_prefix`` and ``oai_set`` are kept on the instance;
        the remaining arguments are forwarded to ``Harvester.__init__``.
        """
        super().__init__(community, url, fromdate, clean, limit, outdir,
                         verify)
        logging.captureWarnings(True)
        self.mdprefix = oai_metadata_prefix
        self.oai_set = oai_set
        self.sickle = Sickle(
            self.url,
            max_retries=3,
            timeout=120,
            verify=self.verify,
        )

    def identifier(self, record):
        """Return the identifier stored in the record's header."""
        return record.header.identifier

    def matches(self):
        """Return the number of matching records.

        The count is read from the ``complete_list_size`` of the resumption
        token of a ``ListIdentifiers`` request. If anything in that attempt
        raises, a warning is logged and the base class result is used.
        """
        try:
            query = {
                'metadataPrefix': self.mdprefix,
                'set': self.oai_set,
                'ignore_deleted': True,
                'from': self.fromdate,
            }
            headers = self.sickle.ListIdentifiers(**query)
            # TODO: complete_list_size is not always set by OAI
            return int(headers.resumption_token.complete_list_size)
        except Exception:
            logging.warning('Could not get complete list size from OAI.')
            return super().matches()

    def check_metadata_format(self):
        """Log an error when the configured prefix is not offered.

        Nothing is raised: a failing ``ListMetadataFormats`` request only
        produces a warning, and an unsupported prefix only an error log.
        """
        try:
            md_formats = [
                fmt.metadataPrefix
                for fmt in self.sickle.ListMetadataFormats()
            ]
        except Exception:
            logging.warning(
                "OAI does not support ListMetadataFormats request.")
            return
        if md_formats and self.mdprefix not in md_formats:
            logging.error(
                f'The metadata format {self.mdprefix} is not supported by the OAI repository. Formats={md_formats}'
            )

    def get_records(self):
        """Yield the records returned by a ``ListRecords`` request.

        ``NoRecordsMatch`` ends the generator with a warning;
        ``CannotDisseminateFormat`` is re-raised as ``HarvesterError``.
        """
        self.check_metadata_format()
        try:
            # NOTE: "from" is a Python keyword, so the arguments are passed
            # as an unpacked dict.
            # https://sickle.readthedocs.io/en/latest/tutorial.html#using-the-from-parameter
            query = {
                'metadataPrefix': self.mdprefix,
                'set': self.oai_set,
                'ignore_deleted': True,
                'from': self.fromdate,
            }
            yield from self.sickle.ListRecords(**query)
        except NoRecordsMatch:
            logging.warning(
                f'No records match the OAI query. from={self.fromdate}')
        except CannotDisseminateFormat:
            raise HarvesterError(
                f'The metadata format {self.mdprefix} is not supported by the OAI repository.'
            )

    def _write_record(self, fp, record, pretty_print=True):
        """Serialize ``record.xml`` and write it to ``fp`` as text."""
        serialized = etree.tostring(record.xml, pretty_print=pretty_print)
        fp.write(serialized.decode('utf8'))