Ejemplo n.º 1
0
def harvest(url):
    """Harvest all OAI-PMH identifiers from *url* and save them to a file.

    Identifiers are listed with the ``oai_dc`` metadata prefix. Only the last
    ``/``-separated segment of each identifier is kept, and the segments are
    written one per line (without a trailing newline) to ``philarchive-2.txt``
    in the directory containing this module.

    Args:
        url: Endpoint URL handed to ``Client``.
    """
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)

    client = Client(url, registry)
    client.ignoreBadCharacters(true_or_false=True)

    # Headers are not filtered on isDeleted(): every listed identifier,
    # including those of deleted records, is collected.
    identifiers = []
    for header in client.listIdentifiers(metadataPrefix='oai_dc'):
        identifier = header.identifier()
        print(f"Found identifier {identifier}")
        identifiers.append(identifier)

    print(f"Total number of identifiers: {len(identifiers)}")

    # Only keep the identifier string at the end of the URL.
    identifiers = [x.split('/')[-1] for x in identifiers]

    dirname = os.path.dirname(__file__)
    filename = os.path.join(dirname, 'philarchive-2.txt')

    # Explicit encoding keeps the output independent of the platform default.
    with open(filename, 'w', encoding='utf-8') as f:
        print(f"Writing to {filename}")
        # The joined text is a single string, so write() is the right call;
        # writelines() would iterate over it character by character.
        f.write('\n'.join(identifiers))
Ejemplo n.º 2
0
    def list_oai_collections(self, community):
        """Collect the collection sets referenced by a community's records.

        Lists identifiers (``oai_dc`` prefix) restricted to the set
        ``community.identifier`` at ``community.repository.base_url``, gathers
        every setSpec whose first three characters are ``'col'``, prints how
        many distinct ones were found, and appends one
        ``[identifier, 'Collection: <identifier>']`` pair per distinct
        collection to ``self.collections``.

        If building the client or issuing the listing call raises, the method
        returns without touching ``self.collections``.
        """
        try:
            registry = MetadataRegistry()
            registry.registerReader('oai_dc', oai_dc_reader)
            client = Client(community.repository.base_url, registry)
            records = client.listIdentifiers(
                metadataPrefix='oai_dc', set=community.identifier)
        except Exception:
            # Best effort: a community that cannot be queried contributes no
            # collections. Catching Exception (rather than a bare except)
            # lets KeyboardInterrupt and SystemExit propagate.
            return

        # Filter records to build the set of collections in the community set.
        community_collections = set()
        for record in records:
            for spec in record.setSpec():
                if spec[:3] == 'col':
                    community_collections.add(spec)

        # Parenthesised single-argument form prints the same on Python 2 and 3.
        print(len(community_collections))

        # Build collection entries: [identifier, human readable name].
        for identifier in community_collections:
            self.collections.append(
                [identifier, 'Collection: %s' % identifier])
Ejemplo n.º 3
0
def list_identifiers(target, date_from, date_until, setspec):
    """Return the converted headers listed by the repository in *target*.

    *target* is a mapping providing ``'url'`` and ``'metadata_prefix'``;
    *date_from*, *date_until* and *setspec* are forwarded to
    ``Client.listIdentifiers`` as ``from_``, ``until`` and ``set``.

    Returns ``None`` when *target* is ``None``; otherwise a list holding
    ``convert_header(header)`` for each listed header (empty when the client
    returns ``None``).
    """
    if target is None:
        return None

    oai_client = Client(target['url'], registry)
    listed = oai_client.listIdentifiers(
        metadataPrefix=target['metadata_prefix'],
        from_=date_from,
        until=date_until,
        set=setspec,
    )
    if listed is None:
        return []
    return [convert_header(entry) for entry in listed]
Ejemplo n.º 4
0
def list_identifiers(target, date_from, date_until, setspec):
    """List identifiers for *target* and convert every returned header.

    Without a target (``None``) nothing is queried and ``None`` is returned.
    Otherwise a ``Client`` is created for ``target['url']`` and asked for the
    identifiers matching ``target['metadata_prefix']``, the date window and
    the set; the result is a list of ``convert_header`` outputs, empty when
    the client hands back ``None``.
    """
    if target is None:
        return None

    # The client is created before the query is assembled, as in the
    # original statement order.
    harvester = Client(target['url'], registry)
    query = {
        'metadataPrefix': target['metadata_prefix'],
        'from_': date_from,
        'until': date_until,
        'set': setspec,
    }
    found = harvester.listIdentifiers(**query)

    converted = []
    if found is not None:
        converted.extend(convert_header(item) for item in found)
    return converted
Ejemplo n.º 5
0
class Repository(object):
    """Handle interaction with the interfaces provided by a DSpace repository.

    Depending on the ``oai_enabled`` / ``sword_enabled`` flags, the instance
    exposes an OAI-PMH ``Client`` as ``self.oai`` and/or a ``SwordService``
    as ``self.sword``.
    """

    def __init__(self, url=None, **kwargs):
        """Configure the repository and create the enabled service clients.

        Args:
            url: Deprecated. When ``base_url`` is given and ``url`` starts
                with it, the remainder becomes ``oai_path`` (if that was not
                given). When ``base_url`` is not given, ``oai_path`` defaults
                to ``'dspace-oai/request'`` and ``base_url`` is derived by
                stripping that path from the end of ``url``.
            **kwargs: Recognised keys are ``base_url``, ``oai_path``,
                ``oai_enabled`` (default ``True``), ``sword_enabled``
                (default ``False``) and ``metadata_registry`` (defaults to a
                ``MetadataRegistry`` with the ``'mets'`` reader registered).
                With SWORD enabled, ``sword_*`` keys are passed to
                ``SwordService`` with the prefix removed. Whatever remains is
                forwarded to ``Client``.

        Raises:
            ValueError: If no ``base_url`` was given or could be derived.
        """
        self.base_url = kwargs.pop('base_url', None)
        self.oai_path = kwargs.pop('oai_path', None)

        self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
        self.sword_enabled = bool(kwargs.pop('sword_enabled', False))

        if url is not None:
            warn(
                'The url paramater will not be supported in version 3, '
                'use base_url and oai_path instead', DeprecationWarning)

            if (self.base_url and url.startswith(self.base_url)
                    and self.oai_path is None):
                self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
            elif not self.base_url:
                if self.oai_path is None:
                    self.oai_path = 'dspace-oai/request'
                if url.endswith(self.oai_path):
                    # The extra character removed is the separator in front
                    # of the OAI path.
                    self.base_url = url[:-(len(self.oai_path) + 1)]

        if self.base_url is None:
            raise ValueError('base_url argument must be specified')

        if 'metadata_registry' not in kwargs:
            kwargs['metadata_registry'] = MetadataRegistry()
            kwargs['metadata_registry'].registerReader('mets',
                                                       dspace_mets_reader)

        if self.sword_enabled:
            skwargs = {'base_url': self.base_url}

            # Iterate over a snapshot of the keys: popping from the dict
            # while iterating its live key view raises RuntimeError on
            # Python 3.
            for key in list(kwargs):
                if key.startswith('sword_'):
                    skwargs[key[6:]] = kwargs.pop(key)

            self.sword = SwordService(**skwargs)

        if self.oai_enabled:
            # NOTE(review): oai_path is still None here when neither oai_path
            # nor url was supplied, in which case the join raises TypeError.
            self.oai = Client('/'.join((
                self.base_url,
                self.oai_path,
            )), **kwargs)

        self.identifier_base = self._extractIdentifierBase(self.base_url)

    def _extractIdentifierBase(self, url):
        """From a given URL, extract the OAI identifier base (hostname)."""
        return urlparse(url).hostname

    def _extractSet(self, handle):
        """Determine the OAI set name from a collection handle.

        Returns ``'hdl_'`` followed by the handle with every ``/`` and ``:``
        replaced by ``_``.

        Raises:
            ValueError: If ``handle`` is not a string.
        """
        try:
            string_types = basestring  # Python 2
        except NameError:
            string_types = str  # Python 3 has no basestring
        if not isinstance(handle, string_types):
            raise ValueError('Collection handles must be strings')
        return 'hdl_' + handle.replace('/', '_').replace(':', '_')

    def getName(self):
        """Get the configured name of the repository."""
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        return self.oai.identify().repositoryName()

    def getCollections(self):
        """Get a list of the collections in the repository.

        Each entry is the first two elements of an item returned by the OAI
        client's ``listSets()``.
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        # A comprehension yields a real list on both Python 2 and 3; map()
        # would return a lazy iterator on Python 3.
        return [c[0:2] for c in self.oai.listSets()]

    def getItemHandles(self, collection=None, **kw):
        """Yield item handles from the OAI-PMH interface.

        A handle is the part of each item identifier after its second ``:``.
        Arguments are the same as for ``getItemIdentifiers``.
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        for item in self.getItemIdentifiers(collection=collection, **kw):
            yield item.identifier().split(':', 2)[2]

    def getItemIdentifiers(self, collection=None, **kw):
        """Get item identifiers from the OAI-PMH interface.

        ``metadataPrefix`` defaults to ``'mets'``. When ``collection`` is
        given, the listing is restricted to the set derived from that
        handle. Other keyword arguments go to ``listIdentifiers`` unchanged.
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')

        if collection:
            kw['set'] = self._extractSet(collection)

        return self.oai.listIdentifiers(**kw)

    def getItems(self, collection=None, **kw):
        """Get full items from the OAI-PMH interface.

        ``metadataPrefix`` defaults to ``'mets'``. When ``collection`` is
        given, the listing is restricted to the set derived from that
        handle. Other keyword arguments go to ``listRecords`` unchanged.
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kw.setdefault('metadataPrefix', 'mets')

        if collection:
            kw['set'] = self._extractSet(collection)

        return self.oai.listRecords(**kw)

    def getItem(self, handle=None, identifier=None, **kwargs):
        """Get a single item from the OAI-PMH interface.

        Exactly one of ``handle`` or ``identifier`` must be provided. A
        handle is expanded to ``'oai:<identifier_base>:<handle>'``.
        ``metadataPrefix`` defaults to ``'mets'``.

        Raises:
            ValueError: If neither or both of ``handle`` and ``identifier``
                are provided.
        """
        assert self.oai_enabled, 'Requires OAI-PMH to be enabled'
        kwargs.setdefault('metadataPrefix', 'mets')

        if handle is None and identifier is None:
            raise ValueError('Either handle or identifier must be provided')

        if handle is not None:
            if identifier is not None:
                raise ValueError('Either a handle or identifier must be '
                                 'provided, not both')

            identifier = 'oai:%s:%s' % (
                self.identifier_base,
                handle,
            )

        return self.oai.getRecord(identifier=identifier, **kwargs)

    def getOAIItemIdentifier(self, handle):
        """Build the ``'oai:<hostname>:<handle>'`` identifier for a handle."""
        return 'oai:%s:%s' % (self._extractIdentifierBase(
            self.base_url), handle)

    def getSwordCollections(self):
        """Placeholder; not implemented, returns ``None``."""
        pass

    def getSwordCollection(self, args):
        """Placeholder; not implemented, returns ``None``."""
        pass
Ejemplo n.º 6
0
# Exploratory walk through an OAI-PMH endpoint. `id` and `oai` are bound
# earlier in the script (presumably `id` is the result of oai.identify() --
# TODO confirm); `id` shadows the builtin of the same name.
# Single-argument print(...) calls produce the same output on Python 2 and 3.
print(id.repositoryName())
print(id.adminEmails())
print(id.baseURL())

formats = oai.listMetadataFormats()
# pprint is a function and must be called; `pprint formats` is a syntax error.
pprint(formats)

# 'marc21'

sets = oai.listSets()
for s in sets:
    print(s)

# 'MZK03'

recids = oai.listIdentifiers(metadataPrefix='marc21', set='MZK03') # from_='2003-01-01T00:00:00Z', until=''

# for example: 'MZK03-907223' is in the list of maps
# or 356050 *not a map

# 238208 problematic
r = oai.getRecord(identifier='MZK03-1479', metadataPrefix='marc21')

# from lxml import etree
# print etree.tostring(r[1],pretty_print=True)

# xpath_evaluator = etree.XPathEvaluator(r[1][0], namespaces={'marc21':'http://www.loc.gov/MARC21/slim'})
# e = xpath_evaluator.evaluate

#s = etree.tostring(r[1][0],pretty_print=True)