def harvest(url):
    """Harvest all OAI-PMH identifiers from *url* and save them to a file.

    Lists the record headers exposed by the repository at ``url`` using the
    ``oai_dc`` metadata prefix, keeps the last ``/``-separated segment of
    each identifier, and writes the results, one per line, to
    ``philarchive-2.txt`` in the directory that contains this module.

    Headers are not filtered on their deleted status (the ``isDeleted()``
    check that used to guard the loop body is disabled).

    Args:
        url: Base URL of the OAI-PMH endpoint to harvest.
    """
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(url, registry)
    client.ignoreBadCharacters(true_or_false=True)

    identifiers = []
    for header in client.listIdentifiers(metadataPrefix='oai_dc'):
        identifier = header.identifier()
        print(f"Found identifier {identifier}")
        identifiers.append(identifier)
    print(f"Total number of identifiers: {len(identifiers)}")

    # Only keep the identifier string at the end of the url.
    short_identifiers = [identifier.split('/')[-1] for identifier in identifiers]

    filename = os.path.join(os.path.dirname(__file__), 'philarchive-2.txt')
    # Explicit encoding so the output does not depend on the platform default.
    with open(filename, 'w', encoding='utf-8') as f:
        print(f"Writing to {filename}")
        # write(), not writelines(): the payload is one already-joined string,
        # and writelines() would iterate over it character by character.
        f.write('\n'.join(short_identifiers))
def list_oai_collections(self, community):
    """Record the collections found in *community*'s OAI set.

    Lists the ``oai_dc`` record headers of the community's set on its
    repository, gathers every setSpec whose first three characters are
    ``'col'``, prints how many distinct ones were found, and appends an
    ``[identifier, name]`` list for each of them to ``self.collections``.

    If the client cannot be set up or the listing call raises, the method
    returns without touching ``self.collections``.

    Args:
        community: Object providing ``identifier`` (used as the OAI set)
            and ``repository.base_url`` (the OAI-PMH endpoint).
    """
    # Retrieve the header data for each record in the current community repo.
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(community.repository.base_url, registry)
        records = client.listIdentifiers(
            metadataPrefix='oai_dc', set=community.identifier)
    except Exception:
        # Deliberately best-effort: a community whose headers cannot be
        # listed simply contributes no collections. ``Exception`` (rather
        # than a bare ``except``) lets KeyboardInterrupt/SystemExit through.
        return

    # Filter records to build the set of collections in the community set.
    community_collections = set()
    for record in records:
        for spec in record.setSpec():
            if spec[:3] == 'col':
                community_collections.add(spec)
    print(len(community_collections))

    # Build collection entries: [identifier, human readable name].
    for spec in community_collections:
        self.collections.append([spec, 'Collection: %s' % spec])
def list_identifiers(target, date_from, date_until, setspec):
    """List the record headers of *target*, converted with convert_header.

    Args:
        target: Mapping with the keys ``'url'`` and ``'metadata_prefix'``,
            or ``None``.
        date_from: Passed to ``listIdentifiers`` as ``from_``.
        date_until: Passed to ``listIdentifiers`` as ``until``.
        setspec: Passed to ``listIdentifiers`` as ``set``.

    Returns:
        ``None`` when ``target`` is ``None``; otherwise a list holding
        ``convert_header(header)`` for every header returned (empty when the
        client returns ``None``).
    """
    if target is None:
        return None

    oai_client = Client(target['url'], registry)
    headers = oai_client.listIdentifiers(
        metadataPrefix=target['metadata_prefix'],
        from_=date_from,
        until=date_until,
        set=setspec)

    if headers is None:
        return []
    return [convert_header(header) for header in headers]
class Repository(object):
    """Handle interaction with the interfaces provided by a DSpace repository.

    Depending on the flags given to the constructor, an instance exposes an
    OAI-PMH client as ``self.oai`` and/or a SWORD service as ``self.sword``.
    """

    # ``basestring`` only exists on Python 2; fall back to ``str`` so handle
    # validation works on either interpreter.
    try:
        _string_types = basestring  # noqa: F821
    except NameError:
        _string_types = str

    def __init__(self, url=None, **kwargs):
        """Configure the repository and create the enabled service clients.

        Args:
            url: Deprecated. Full URL of the OAI-PMH endpoint, used to derive
                ``oai_path`` (when ``base_url`` is given and is a prefix of
                it) or ``base_url`` (when it is not given).
            **kwargs: ``base_url``, ``oai_path``, ``oai_enabled`` (default
                ``True``) and ``sword_enabled`` (default ``False``) are
                consumed here. ``metadata_registry`` is created with a
                ``'mets'`` reader when absent. When SWORD is enabled, keys
                starting with ``sword_`` are handed to ``SwordService`` with
                the prefix removed. Whatever remains (including
                ``metadata_registry``) is forwarded to the OAI ``Client``
                when OAI is enabled.

        Raises:
            ValueError: If no base URL was given and none could be derived
                from ``url``.
        """
        self.base_url = kwargs.pop('base_url', None)
        self.oai_path = kwargs.pop('oai_path', None)
        self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
        self.sword_enabled = bool(kwargs.pop('sword_enabled', False))

        if url is not None:
            # stacklevel=2 attributes the warning to the caller's line.
            warn(
                'The url paramater will not be supported in version 3, '
                'use base_url and oai_path instead',
                DeprecationWarning, stacklevel=2)

            if (self.base_url and url.startswith(self.base_url) and
                    self.oai_path is None):
                # The OAI path is whatever follows the base URL.
                self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
            elif not self.base_url:
                if self.oai_path is None:
                    self.oai_path = 'dspace-oai/request'
                if url.endswith(self.oai_path):
                    # Strip the OAI path and the '/' that precedes it.
                    self.base_url = url[:-(len(self.oai_path) + 1)]

        if self.base_url is None:
            raise ValueError('base_url argument must be specified')

        if 'metadata_registry' not in kwargs:
            kwargs['metadata_registry'] = MetadataRegistry()
            kwargs['metadata_registry'].registerReader(
                'mets', dspace_mets_reader)

        if self.sword_enabled:
            skwargs = {'base_url': self.base_url}
            # Iterate over a snapshot of the keys: entries are popped inside
            # the loop, which would otherwise raise RuntimeError on Python 3.
            for key in list(kwargs):
                if key.startswith('sword_'):
                    skwargs[key[6:]] = kwargs.pop(key)
            self.sword = SwordService(**skwargs)

        if self.oai_enabled:
            self.oai = Client(
                '/'.join((self.base_url, self.oai_path)), **kwargs)

        self.identifier_base = self._extractIdentifierBase(self.base_url)

    def _extractIdentifierBase(self, url):
        """Return the hostname of *url*, used as the OAI identifier base."""
        return urlparse(url).hostname

    def _extractSet(self, handle):
        """Return the OAI set name for a collection handle.

        The result is ``'hdl_'`` followed by the handle with every ``/`` and
        ``:`` replaced by ``_``.

        Raises:
            ValueError: If ``handle`` is not a string.
        """
        if not isinstance(handle, self._string_types):
            raise ValueError('Collection handles must be strings')
        return 'hdl_' + handle.replace('/', '_').replace(':', '_')

    def getName(self):
        """Return the repository name reported by the OAI Identify call."""
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return self.oai.identify().repositoryName()

    def getCollections(self):
        """Return a list with the first two fields of every OAI set."""
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        # A list comprehension keeps the result a real list on Python 3,
        # where ``map`` would return a one-shot iterator.
        return [oai_set[0:2] for oai_set in self.oai.listSets()]

    def getItemHandles(self, collection=None, **kw):
        """Yield item handles from the OAI-PMH interface.

        Each handle is the part of an item identifier that follows its
        second ``:``. Arguments are the same as for ``getItemIdentifiers``.
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        for item in self.getItemIdentifiers(collection=collection, **kw):
            yield item.identifier().split(':', 2)[2]

    def getItemIdentifiers(self, collection=None, **kw):
        """Return the item headers listed by the OAI-PMH interface.

        Args:
            collection: Optional collection handle; when truthy, the listing
                is restricted to the corresponding OAI set.
            **kw: Passed to ``listIdentifiers``; ``metadataPrefix`` defaults
                to ``'mets'``.
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')
        if collection:
            kw['set'] = self._extractSet(collection)
        return self.oai.listIdentifiers(**kw)

    def getItems(self, collection=None, **kw):
        """Return the full records listed by the OAI-PMH interface.

        Args:
            collection: Optional collection handle; when truthy, the listing
                is restricted to the corresponding OAI set.
            **kw: Passed to ``listRecords``; ``metadataPrefix`` defaults to
                ``'mets'``.
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')
        if collection:
            kw['set'] = self._extractSet(collection)
        return self.oai.listRecords(**kw)

    def getItem(self, handle=None, identifier=None, **kwargs):
        """Return a single record, looked up by handle or by identifier.

        Args:
            handle: Item handle; turned into an identifier of the form
                ``oai:<identifier_base>:<handle>``.
            identifier: Complete OAI identifier.
            **kwargs: Passed to ``getRecord``; ``metadataPrefix`` defaults
                to ``'mets'``.

        Raises:
            ValueError: If neither or both of ``handle`` and ``identifier``
                are given.
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kwargs.setdefault('metadataPrefix', 'mets')

        if handle is None and identifier is None:
            raise ValueError('Either handle or identifier must be provided')

        if handle is not None:
            if identifier is not None:
                raise ValueError('Either a handle or identifier must be '
                                 'provided, not both')
            identifier = 'oai:%s:%s' % (self.identifier_base, handle)

        return self.oai.getRecord(identifier=identifier, **kwargs)

    def getOAIItemIdentifier(self, handle):
        """Return ``oai:<hostname of base_url>:<handle>``."""
        return 'oai:%s:%s' % (
            self._extractIdentifierBase(self.base_url), handle)

    def getSwordCollections(self):
        """Placeholder; currently does nothing and returns ``None``."""
        pass

    def getSwordCollection(self, args):
        """Placeholder; currently does nothing and returns ``None``."""
        pass
# Exploratory OAI-PMH session. `id` and `oai` are created earlier in the
# script (not visible here) -- presumably an Identify response and the
# client it came from; confirm against the preceding lines.
print(id.repositoryName())
print(id.adminEmails())
print(id.baseURL())

formats = oai.listMetadataFormats()
# NOTE(review): assumes `pprint` is the function (from pprint import pprint);
# the original `pprint formats` was not valid syntax in any Python version.
pprint(formats)
# 'marc21'

sets = oai.listSets()
for s in sets:
    print(s)
# 'MZK03'

recids = oai.listIdentifiers(metadataPrefix='marc21', set='MZK03')
# from_='2003-01-01T00:00:00Z', until=''

# for example: 'MZK03-907223' is in the list of maps
# or 356050 *not a map
# 238208 problematic
r = oai.getRecord(identifier='MZK03-1479', metadataPrefix='marc21')

# from lxml import etree
# print etree.tostring(r[1],pretty_print=True)

# xpath_evaluator = etree.XPathEvaluator(r[1][0], namespaces={'marc21':'http://www.loc.gov/MARC21/slim'})
# e = xpath_evaluator.evaluate

#s = etree.tostring(r[1][0],pretty_print=True)