def gather_stage(self, harvest_job):
    url = harvest_job.source.url
    # Test whether we should use OAI-PMH or DDI
    metadata_registry = MetadataRegistry()
    metadata_registry.registerReader('oai_dc', oai_dc_reader)
    client = oaipmh.client.Client(url, metadata_registry)
    try:
        client.identify()
    except XMLSyntaxError:
        self.harvester = DDIHarvester()
    except urllib2.URLError:
        self._save_gather_error('Could not identify source!', harvest_job)
        return None
    if not self.harvester:
        self.harvester = OAIPMHHarvester()
    objs = self.harvester.gather_stage(harvest_job)
    ret = []
    for obj in objs:
        obj = HarvestObject.get(obj)
        cont = obj.content
        content_dict = json.loads(cont)
        content_dict['harv'] = jsonpickle.encode(self.harvester)
        obj.content = json.dumps(content_dict)
        obj.save()
        ret.append(obj.id)
    return ret
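# A minimal sketch (not taken from any snippet in this file) of the pyoai
# pattern the functions below repeat: register a Dublin Core reader, point a
# Client at an endpoint, then iterate records. The endpoint URL and function
# name are placeholders, not one of the sources used above.
from oaipmh.client import Client
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

def list_titles(endpoint_url):
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(endpoint_url, registry)
    print(client.identify().repositoryName())
    for header, metadata, _ in client.listRecords(metadataPrefix='oai_dc'):
        if header.isDeleted() or metadata is None:
            continue
        # getField returns the list of values for the given Dublin Core field
        yield header.identifier(), metadata.getField('title')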
def scrape(self):
    raise Exception("not finished")
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    url = self.setting('pmh-endpoint')
    client = Client(url, registry)

    print " OAI Repository", url
    print " Available sets:"
    for s in client.listSets():
        print " ", s

    oai_set = self.setting('set')
    oai_from = self.setting('from')
    oai_until = self.setting('until')

    kwargs = {}

    if oai_set:
        kwargs['set'] = oai_set

    if oai_from is not None:
        date_args = [int(arg) for arg in oai_from.split("-")]
        kwargs['from_'] = datetime.datetime(*date_args)

    if oai_until is not None:
        date_args = [int(arg) for arg in oai_until.split("-")]
        kwargs['until'] = datetime.datetime(*date_args)

    records = [r for r in client.listRecords(metadataPrefix='oai_dc', **kwargs)]

    data_filepath = os.path.join(self.work_dir(), self.setting('data-file'))
    with open(data_filepath, 'wb') as f:
        print " picking", len(records), "records"
        pickle.dump(records, f)
def init(user):
    fullURL = URL + user
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(fullURL, registry)
    logging.info('The community %s harvested', user)
    return client
def __init__(self, oaisource, day_granularity=False, *args, **kwargs):
    """
    This sets up the paper source.

    :param oaisource: the OAISource to fetch from.
    :param day_granularity: should we use day-granular timestamps
        to fetch from the proxy or full timestamps
        (default: False, full timestamps)

    See the protocol reference for more information on timestamp
    granularity:
    https://www.openarchives.org/OAI/openarchivesprotocol.html
    """
    super(OaiPaperSource, self).__init__(*args, **kwargs)
    if not oaisource.endpoint:
        raise ValueError(
            'No OAI endpoint was configured for this OAI source.')
    self.registry = MetadataRegistry()
    self.registry.registerReader('oai_dc', oai_dc_reader)
    self.registry.registerReader('base_dc', base_dc_reader)
    self.client = Client(oaisource.endpoint, self.registry)
    self.client._day_granularity = day_granularity
    self.translators = {
        'oai_dc': OAIDCTranslator(oaisource),
        'base_dc': BASEDCTranslator(oaisource),
    }
def __init__(self, url):
    registry = MetadataRegistry()
    registry.registerReader(ZoraAPI.METADATA_PREFIX, oai_dc_reader)
    self.client = Client(url, registry)
    self.institutes = {}
    self.resource_types = []
    self.load_institutes_and_types()
def __init__(self, endpoint, day_granularity=False, *args, **kwargs):
    """
    This sets up the paper source.

    :param endpoint: the address of the OAI-PMH endpoint to fetch from.
    :param day_granularity: should we use day-granular timestamps
        to fetch from the proxy or full timestamps
        (default: False, full timestamps)

    See the protocol reference for more information on timestamp
    granularity:
    https://www.openarchives.org/OAI/openarchivesprotocol.html
    """
    super(OaiPaperSource, self).__init__(*args, **kwargs)
    self.registry = MetadataRegistry()
    self.registry.registerReader('oai_dc', oai_dc_reader)
    self.registry.registerReader('base_dc', base_dc_reader)
    self.registry.registerReader('citeproc', citeproc_reader)
    self.client = Client(endpoint, self.registry)
    self.client._day_granularity = day_granularity
    if settings.PROAIXY_API_KEY:
        self.client.extra_parameters = {
            'key': settings.PROAIXY_API_KEY}
    self.translators = {}
def harvest(url):
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(url, registry)
    client.ignoreBadCharacters(true_or_false=True)

    identifiers = []
    for header in client.listIdentifiers(metadataPrefix='oai_dc'):
        # if (not(header.isDeleted())):
        print(f"Found identifier {header.identifier()}")
        identifiers.append(header.identifier())
        # else:
        #     print(f"Skipping (DELETED) identifier {header.identifier()}")

    print(f"Total number of identifiers: {len(identifiers)}")

    # Only get the identifier string at the end of the url
    identifiers = [x.split('/')[-1] for x in identifiers]

    dirname = os.path.dirname(__file__)
    filename = os.path.join(dirname, 'philarchive-2.txt')
    with open(filename, 'w') as f:
        print(f"Writing to {filename}")
        f.writelines('\n'.join(identifiers))
def setUp(self):
    self.registry = MetadataRegistry()
    self.registry.registerReader('mets', dspace_mets_reader)
    self.element = etree.parse(
        os.path.join(os.path.dirname(__file__), 'dspace_mets.xml')).getroot()
    self.item = self.registry.readMetadata('mets', self.element)
def arxiv_oai_scraper(subject, start, end, sleep_time=0):
    base_url = "http://export.arxiv.org/oai2"
    output = list()

    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(base_url, registry)
    client.updateGranularity()

    records = client.listRecords(metadataPrefix='oai_dc',
                                 set="{}".format(subject),
                                 from_=start, until=end)

    for _, md, _ in records:
        # print md.getField("title")
        # checks for the case in 2010 when there is no title for something
        if md is not None:
            txt_dict = {"title": md["title"],
                        "abstract": md["description"],
                        "date": md["date"],
                        "subject": md["subject"],
                        "url": md["identifier"],
                        "authors": md['creator']}
            output.append(txt_dict)

        time.sleep(sleep_time)

    return output
def list_oai_collections(self, community):
    """ Retrieve the header data for each record in the current community repo """
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(community.repository.base_url, registry)
        records = client.listIdentifiers(
            metadataPrefix='oai_dc', set=community.identifier)
    except:
        community_collections = set()
        return

    """ Filter records to build list of collections in the community set """
    community_collections = set()
    for i in records:
        for j in i.setSpec():
            if j[:3] == 'col':
                community_collections.add(j)
    print len(community_collections)

    """ Build collection tuples (identifier, name) """
    for i in community_collections:
        # print i
        # print community_collections
        set_data = []
        set_data.append(i)                     # Store identifier
        set_data.append('Collection: %s' % i)  # Store human readable name
        # print set_data
        self.collections.append(set_data)
def __init__(self, url):
    """Initialize client."""
    registry = MetadataRegistry()
    registry.registerReader('oaf', self.oaf_reader)
    return super(OpenAireClient, self).__init__(
        url, metadata_registry=registry
    )
def insertAll(time, time2):
    registry = MetadataRegistry()
    registry.registerReader('arXivRaw', arXivRaw_reader)
    client = Client(URL, registry)
    client.updateGranularity()
    list = client.listRecords(metadataPrefix='arXivRaw', from_=time, until=time2)
    errors = 0
    for a in list:
        # a = list.next()
        try:
            title = '\n'.join(a[1]['title'])
            sr2 = str(' '.join(a[1]['categories']).replace('-', '_')).split(' ')
            abstract = '\n'.join(a[1]['abstract'])
            url = 'http://arxiv.org/abs/' + a[1]['id'][0]
            date = datetime.strptime(a[1]['created'][0],
                                     '%a, %d %b %Y %H:%M:%S %Z')
            authors = a[1]['authors'][0]  # '; '.join(a[1]['keynames'])
            abstract = abstract + '\nBy: ' + authors + '\nIn: ' + ', '.join(sr2)
            print title
            print sr2
            print abstract
            print url
            print date
            print authors
            insert(title + ' (' + authors + ')', str("fullarxiv"), url, abstract,
                   date=date, cross_srs=sr2)
        except:
            print 'ERROR'
            print a
            errors = errors + 1
    print 'Completed with %s errors' % errors
def __init__(self, url, prefix=nsdl.LR_NSDL_PREFIX, reader=None,
             fields=None, namespaces=None, fieldMap=None):
    '''
    Constructor
    '''
    if fields is None:
        self._fields = nsdl.LR_NSDL_DC_FIELDS
    else:
        self._fields = fields
    if fieldMap is None:
        self._fieldMap = nsdl.NSDL_TO_LR_MAP
    else:
        self._fieldMap = fieldMap
    if namespaces is None:
        self._namespaces = nsdl.LR_NSDL_DC_NAMESPACES
    else:
        self._namespaces = namespaces
    if reader is None:
        reader = MetadataReader(fields=self._fields,
                                namespaces=self._namespaces)
    self._url = url
    self._registry = MetadataRegistry()
    self._prefix = prefix
    self._registry.registerReader(prefix, reader)
    self._client = Client(url, self._registry)
def _get_client_identifier(self, url, harvest_job=None):
    registry = MetadataRegistry()
    registry.registerReader(self.metadata_prefix_value, oai_dc_reader)
    client = oaipmh.client.Client(url, registry)
    try:
        identifier = client.identify()
    except (urllib2.URLError, urllib2.HTTPError,):
        if harvest_job:
            self._save_gather_error(
                'Could not gather from %s!' % harvest_job.source.url,
                harvest_job)
        return client, None
    except socket.error:
        if harvest_job:
            errno, errstr = sys.exc_info()[:2]
            self._save_gather_error(
                'Socket error OAI-PMH %s, details:\n%s' % (errno, errstr),
                harvest_job)
        return client, None
    except ValueError:
        # We have no source URL when importing via UI.
        return client, None
    except Exception as e:
        # Guard against miscellaneous stuff. Probably plain bugs.
        log.debug(traceback.format_exc(e))
        return client, None
    return client, identifier
def _create_metadata_registry(self):
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerReader('oai_ddi', oai_ddi_reader)
    # TODO: Change back?
    registry.registerReader('dif', dif_reader2)
    # HDR
    registry.registerReader('datacite', datacite_reader)
    return registry
def __init__(self, configuration_file):
    """Constructor."""
    self.oai_config = ConfigParser.SafeConfigParser()
    self.oai_config.read(configuration_file)
    self.current_config = 'ToulouseBis'
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    self.client = Client(self._get_config_value('url'), registry)
def gather_stage(self, harvest_job):
    '''
    The gather stage will receive a HarvestJob object and will be
    responsible for:
        - gathering all the necessary objects to fetch on a later
          stage (e.g. for a CSW server, perform a GetRecords request)
        - creating the necessary HarvestObjects in the database,
          specifying the guid and a reference to its source and job.
        - creating and storing any suitable HarvestGatherErrors that
          may occur.
        - returning a list with all the ids of the created HarvestObjects.

    :param harvest_job: HarvestJob object
    :returns: A list of HarvestObject ids
    '''
    self._set_config(harvest_job.source.config)
    sets = []
    harvest_objs = []
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = oaipmh.client.Client(harvest_job.source.url, registry)
    try:
        identifier = client.identify()
    except urllib2.URLError:
        self._save_gather_error('Could not gather anything from %s!' %
                                harvest_job.source.url, harvest_job)
        return None
    domain = identifier.repositoryName()
    group = Group.by_name(domain)
    if not group:
        group = Group(name=domain, description=domain)
    query = self.config['query'] if 'query' in self.config else ''
    try:
        for set in client.listSets():
            identifier, name, _ = set
            if 'query' in self.config:
                if query in name:
                    sets.append((identifier, name))
            else:
                sets.append((identifier, name))
    except NoSetHierarchyError:
        sets.append(('1', 'Default'))
        self._save_gather_error('Could not fetch sets!', harvest_job)
    for set_id, set_name in sets:
        harvest_obj = HarvestObject(job=harvest_job)
        harvest_obj.content = json.dumps({
            'set': set_id,
            'set_name': set_name,
            'domain': domain,
        })
        harvest_obj.save()
        harvest_objs.append(harvest_obj.id)
    model.repo.commit()
    return harvest_objs
def test(request):
    URL = 'http://www.kulturarv.dk/ffrepox/OAIHandler'
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(URL, registry)
    identifyResponse = client.identify()
    print dir(identifyResponse)
    # for record in client.listRecords(metadataPrefix='oai_dc'):
    #     result += record
    return HttpResponse(identifyResponse.repositoryName())
def _registerReader(metadata_format):
    """
    """
    # TODO, check namespaces
    if metadata_format in ("metashare", "cmdi", "olac"):
        metadata_registry = MetadataRegistry()
        metadata_registry.registerReader(metadata_format, Reader())
        return metadata_registry
    else:
        raise NotImplementedError("The %s metadata format is "
                                  "currently not supported." % metadata_format)
def test_get_record(self):
    metadata_reg = MetadataRegistry()
    metadata_reg.registerReader('oai_dc', oai_dc_reader)
    client = Client(config.get('ckan.site_url') + self.base_url, metadata_reg)
    res = self._oai_get_method_and_validate(
        '?verb=ListIdentifiers&metadataPrefix=oai_dc&set=roger')
    urllib2.urlopen = mock.Mock(return_value=StringIO(res))
    ids = client.listIdentifiers(metadataPrefix='oai_dc')
    offset = self.base_url + \
        '?verb=GetRecord&identifier=%s&metadataPrefix=oai_dc' \
        % ids.next().identifier()
    res = self.app.get(offset)
    self.assert_(oaischema.validate(etree.fromstring(res.body)))
    self.assert_("abraham" in res.body)
def harvest_oai_collection_records(self, collection):
    records = []
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(collection.community.repository.base_url, registry)
        records = client.listRecords(
            metadataPrefix='oai_dc', set=collection.identifier)
    except:
        return
    return records
def clean(self):
    cleaned_data = super(CreateRepositoryForm, self).clean()
    try:
        registry = MetadataRegistry()
        registry.registerReader('oai_dc', oai_dc_reader)
        client = Client(cleaned_data.get('base_url'), registry)
        server = client.identify()
        # set the repository name to apply to the model instance when saved.
        cleaned_data['name'] = server.repositoryName()
    except:
        raise ValidationError('Repository base url is invalid.')
    return cleaned_data
def test_resumption_identifiers(self):
    metadata_reg = MetadataRegistry()
    metadata_reg.registerReader('oai_dc', oai_dc_reader)
    urllib2.urlopen = realopen
    client = CKANServer()
    metadata_registry = metadata.MetadataRegistry()
    metadata_registry.registerReader('oai_dc', oai_dc_reader)
    metadata_registry.registerWriter('oai_dc', oai_dc_writer)
    serv = BatchingServer(client, metadata_registry=metadata_registry)
    client = ServerClient(serv, metadata_reg)
    recs = client.listIdentifiers(metadataPrefix='oai_dc')
    for rec in recs:
        self.assert_(rec)
def get_client(url, transforms):
    transforms = fix_transforms(transforms)
    registry = MetadataRegistry()
    c = Client(url, registry)
    metadata = c.listMetadataFormats()
    metadata[0] = ['fbb',
                   'http://www.kulturarv.dk/fbb/fbb.xsd',
                   'http://www.kulturarv.dk/fbb']
    namespaces = dict((x[0], x[2]) for x in metadata)
    fields = dict((transform['field'], ('textList', transform['path']))
                  for transform in transforms)
    namespace = metadata[0][0]
    print namespaces, fields
    registry.registerReader(
        namespace, MetadataReader(fields=fields, namespaces=namespaces))
    return c, namespace
def index_documents(main_url, database_name, url, reader, prefix, format):
    registry = MetadataRegistry()
    registry.registerReader(prefix, reader)
    client = Client(url, registry)
    return_stuff = []
    for record in client.listRecords(metadataPrefix=prefix):
        r = record[1]
        value = format(r, record[0].identifier())
        if value != None:
            return_stuff.append(value)
        if len(return_stuff) >= 10000:
            sync_files(main_url, database_name, return_stuff)
            return_stuff = []
    sync_files(main_url, database_name, return_stuff)
def read_base_records(self):
    registry = MetadataRegistry()
    registry.registerReader('base_dc', base_dc_reader)
    client = Client('http://doai.io/oai', registry)
    for header, record, _ in client.listRecords(metadataPrefix='base_dc'):
        # only process records for which base was unsure
        if '2' not in record['oa']:
            continue
        # extract splash_url
        for link in record['identifier']:
            metadata = {'base_oa': ''.join(record['oa']),
                        'splash_url': link,
                        'from_identifier': header.identifier()}
            yield self.filter_url(link, metadata, looking_for='any')
def setUp(self):
    super(BookMetadataTest, self).setUp()
    xml = path.join(path.dirname(__file__), 'files/lubie-kiedy-kobieta.xml')
    self.book = models.Book.from_xml_file(xml)
    xml = path.join(path.dirname(__file__), 'files/antygona.xml')
    self.book2 = models.Book.from_xml_file(xml)
    mr = MetadataRegistry()
    self.catalogue = Catalogue(mr)
    mr.registerWriter('oai_dc', oai_dc_writer)
    nsmap = {'oai_dc': NS_OAIDC, 'dc': NS_DC, 'xsi': NS_XSI}
    self.xml = XMLTreeServer(self.catalogue, mr, nsmap)
def update(self, from_date=None):
    self._log.info('Harvesting oai server: %s' % self._url)
    registry = MetadataRegistry()
    registry.registerReader(self._prefix, lambda el: el)
    client = Client(self._url, registry)
    try:
        for header, element, about in client.listRecords(
                metadataPrefix=self._prefix, from_=from_date):
            added = self._process_record(header, element)
            if added:
                yield self._get_id(header)
    except NoRecordsMatchError:
        pass
    super(OAIBasedContentProvider, self).update()
def __init__(self, dbName):
    global configs, dbs, session
    self.protocolMap = configs[dbName]
    self.db = dbs[dbName]
    session.database = self.db.id
    # get some generally useful stuff now
    self.baseURL = self.protocolMap.baseURL
    # get earliest datestamp in database
    # get UTC of the epoch as query term
    q = cqlparse('rec.lastModificationDate > "%s"' %
                 (str(datetime.datetime.utcfromtimestamp(0))))
    try:
        tl = self.db.scan(session, q, 1)
    except SRWDiagnostics.Diagnostic16:
        raise ConfigFileException(
            'Index map for rec.lastModificationDate required in '
            'protocolMap: %s' % self.db.get_path(session, 'protocolMap').id)
    else:
        try:
            datestamp = tl[0][0]
        except IndexError:
            # something went wrong :( - use the epoch
            self.earliestDatestamp = datetime.datetime.utcfromtimestamp(0)
        else:
            try:
                self.earliestDatestamp = datetime.datetime.strptime(
                    datestamp, '%Y-%m-%dT%H:%M:%S')
            except ValueError:
                self.earliestDatestamp = datetime.datetime.strptime(
                    datestamp, '%Y-%m-%d %H:%M:%S')
    self.repositoryName = self.protocolMap.title
    self.protocolVersion = self.protocolMap.version
    self.adminEmails = self.protocolMap.contacts
    self.deletedRecord = "no"  # Cheshire3 does not support deletions at this time
    self.granularity = "YYYY-MM-DDThh:mm:ssZ"  # finest level of granularity
    self.compression = []  # Cheshire3 does not support compressions at this time
    self.metadataRegistry = OaiMetadataRegistry()
def scrape(start=START, end=END, set=SET_THESIS, type='Thesis'):
    """
    Create an OAI-PMH client, gather metadata and output it.
    """
    total = num = 0
    msg = "Fetching records between " + str(start) + " and " + str(end)
    sys.stderr.write(msg + "\n")

    #
    # Set up metadata readers
    #
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerReader('qdc', qdc_reader)
    # registry.registerReader('rdf', rdf_reader)    # no reader yet
    # registry.registerReader('ore', ore_reader)    # no reader yet
    # registry.registerReader('mets', mets_reader)  # no reader yet

    client = Client(URL, registry)
    records = client.listRecords(metadataPrefix='qdc', from_=start, until=end,
                                 set=set)

    for (h, m, a) in records:
        print h, m, a
        if not m:
            sys.stderr.write("o")
            continue
        total = total + 1
        handle = m.getField('identifier')
        if not handle:
            sys.stderr.write("Record without a handle.\n")
            continue
        r = dict({'handle': handle[0]})
        for key in qdc_reader._fields.keys():
            r[key] = m.getField(key)
        RECORDS.append(r)
        sys.stderr.write('.')
        sys.stderr.flush()
        num = num + 1

    msg = "\nCollected " + str(num) + " records, out of " + str(total)
    sys.stderr.write('\n' + msg + '\n')

    if options.store:
        pickle.dump(RECORDS, open(options.store, "wb"))
def _get_client_identifier(self, url, harvest_job=None):
    registry = MetadataRegistry()
    if 'metadata_formats' in self.config:
        for mdp in self.config['metadata_formats']:
            registry.registerReader(mdp, kata_oai_dc_reader)
        if self.metadata_prefix_value not in self.config['metadata_formats']:
            registry.registerReader(self.metadata_prefix_value, kata_oai_dc_reader)
    else:
        registry.registerReader(self.metadata_prefix_value, kata_oai_dc_reader)
    client = oaipmh.client.Client(url, registry)
    try:
        identifier = client.identify()
        # quickfix: to set the correct datetime granularity,
        # updateGranularity has to be called
        client.updateGranularity()
    except (urllib2.URLError, urllib2.HTTPError) as err:
        log.debug("Error occurred: {0}".format(err))
        if harvest_job:
            self._save_gather_error('Could not gather from %s!' %
                                    harvest_job.source.url, harvest_job)
        return client, None
    except socket.error:
        if harvest_job:
            errno, errstr = sys.exc_info()[:2]
            self._save_gather_error('Socket error OAI-PMH %s, details:\n%s' %
                                    (errno, errstr), harvest_job)
        return client, None
    except ValueError:
        # We have no source URL when importing via UI.
        return client, None
    except Exception as e:
        # Guard against miscellaneous stuff. Probably plain bugs.
        log.debug(traceback.format_exc(e))
        return client, None
    return client, identifier
def processItems():
    oai_oi_reader = MetadataReader(
        fields={
            'title': ('textList', 'oai_oi:oi/oi:title/text()'),
            'alternative': ('textList', 'oai_oi:oi/oi:alternative/text()'),
            'creator': ('textList', 'oai_oi:oi/oi:creator/text()'),
            'subject': ('textList', 'oai_oi:oi/oi:subject/text()'),
            'description': ('textList', 'oai_oi:oi/oi:description/text()'),
            'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
            'publisher': ('textList', 'oai_oi:oi/oi:publisher/text()'),
            'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
            'date': ('textList', 'oai_oi:oi/oi:date/text()'),
            'type': ('textList', 'oai_oi:oi/oi:type/text()'),
            'extent': ('textList', 'oai_oi:oi/oi:extend/text()'),
            'medium': ('textList', 'oai_oi:oi/oi:medium/text()'),
            'identifier': ('textList', 'oai_oi:oi/oi:identifier/text()'),
            'source': ('textList', 'oai_oi:oi/oi:source/text()'),
            'language': ('textList', 'oai_oi:oi/oi:language/text()'),
            'references': ('textList', 'oai_oi:oi/oi:references/text()'),
            'spatial': ('textList', 'oai_oi:oi/oi:spatial/text()'),
            'attributionName': ('textList', 'oai_oi:oi/oi:attributionName/text()'),
            'attributionURL': ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
            'license': ('textList', 'oai_oi:oi/oi:license/text()'),
            # Not present in the feed:
            # 'rights': ('textList', 'oai_oi:oi/oi:rights/text()'),
            # 'relation': ('textList', 'oai_oi:oi/oi:relation/text()'),
            # 'coverage': ('textList', 'oai_oi:oi/oi:coverage/text()'),
            # 'format': ('textList', 'oai_oi:oi/oi:format/text()'),
        },
        namespaces={
            'oi': 'http://www.openbeelden.nl/oai/',
            'oai_oi': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms',
        })

    url = u'http://www.openbeelden.nl/feeds/oai/'
    registry = MetadataRegistry()
    registry.registerReader('oai_oi', oai_oi_reader)
    client = Client(url, registry)
    for record in client.listRecords(metadataPrefix='oai_oi'):
        processItem(record)
def writeMetadata(self, metadata_prefix, element, metadata):
    try:
        return MetadataRegistry.writeMetadata(
            self, metadata_prefix, element, metadata)
    except KeyError as key_error:
        try:
            return self.defaultWriter(element, metadata)
        except AttributeError:
            raise key_error
def readMetadata(self, metadata_prefix, element):
    try:
        return MetadataRegistry.readMetadata(self, metadata_prefix, element)
    except KeyError as key_error:
        try:
            return self.defaultReader(element)
        except AttributeError:
            raise key_error
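# A minimal sketch (not taken from any snippet above) of how the two fallback
# methods readMetadata/writeMetadata could be hosted: a MetadataRegistry
# subclass that stores optional defaultReader/defaultWriter callables used when
# no reader or writer is registered for a prefix. The class name and
# constructor are assumptions for illustration only.
from oaipmh.metadata import MetadataRegistry, oai_dc_reader

class FallbackMetadataRegistry(MetadataRegistry):
    """Registry that falls back to default reader/writer callables when no
    reader/writer is registered for the requested metadata prefix."""

    def __init__(self, defaultReader=None, defaultWriter=None):
        MetadataRegistry.__init__(self)
        if defaultReader is not None:
            self.defaultReader = defaultReader    # used when no reader matches
        if defaultWriter is not None:
            self.defaultWriter = defaultWriter    # used when no writer matches

    # the readMetadata / writeMetadata overrides shown above would go here

# Usage: unknown prefixes fall through to an identity reader that returns the
# raw XML element unchanged.
registry = FallbackMetadataRegistry(defaultReader=lambda element: element)
registry.registerReader('oai_dc', oai_dc_reader)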
def iter_items(self, partition):
    """ Partition is an OAI-PMH endpoint """
    # source = "oai:%s" % partition
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(partition, registry)
    for record in client.listRecords(metadataPrefix='oai_dc'):
        header, metadata, _ = record
        if header.isDeleted():
            continue
        # _id = header.identifier()
        # date = header.datestamp()
        meta = metadata.getMap()
        # TODO: there is much validation and many heuristics to be done here!
        # format0 = (meta.get("format") or [None])[0]
        # if not format0:
        #     continue
        # if format0 not in ("application/pdf", ):
        #     continue
        url0 = (meta.get("identifier") or [None])[0]
        if not url0:
            continue
        title0 = (meta.get("title") or [""])[0].encode("utf-8")
        desc0 = (meta.get("description") or [""])[0].encode("utf-8")
        # TODO: validate that url0 is not on another domain?!
        yield url0, {}, "html", 2, """
            <html><head><title>%s</title></head><body>%s</body></html>
        """ % (title0, desc0)
def import_stage(self, harvest_object):
    """
    The import stage will receive a HarvestObject object and will be
    responsible for:
        - performing any necessary action with the fetched object (e.g.
          create a CKAN package).
          Note: if this stage creates or updates a package, a reference
          to the package must be added to the HarvestObject.
          Additionally, the HarvestObject must be flagged as current.
        - creating the HarvestObject - Package relation (if necessary)
        - creating and storing any suitable HarvestObjectErrors that may
          occur.
        - returning True if everything went as expected, False otherwise.

    :param harvest_object: HarvestObject object
    :returns: True if everything went right, False if errors were found
    """
    # Do common tasks and then call different methods depending on what
    # kind of info the harvest object contains.
    self._set_config(harvest_object.job.source.config)
    ident = json.loads(harvest_object.content)
    registry = MetadataRegistry()
    registry.registerReader(self.metadata_prefix_value, oai_dc_reader)
    client = oaipmh.client.Client(harvest_object.job.source.url, registry)
    domain = ident['domain']
    group = Group.get(domain)  # Checked in gather_stage so exists.
    try:
        if ident['fetch_type'] == 'record':
            return self._fetch_import_record(
                harvest_object, ident, client, group)
        if ident['fetch_type'] == 'set':
            return self._fetch_import_set(
                harvest_object, ident, client, group)
        # This should not happen...
        log.error('Unknown fetch type: %s' % ident['fetch_type'])
    except Exception as e:
        # Guard against miscellaneous stuff. Probably plain bugs.
        # Also very rare exceptions we haven't seen yet.
        log.debug(traceback.format_exc(e))
    return False
def setupOAIPMHConnection(self):
    oai_oi_reader = MetadataReader(
        fields={
            'title': ('textList', 'oai_oi:oi/oi:title/text()'),
            'alternative': ('textList', 'oai_oi:oi/oi:alternative/text()'),
            'creator': ('textList', 'oai_oi:oi/oi:creator/text()'),
            'subject': ('textList', 'oai_oi:oi/oi:subject/text()'),
            'description': ('textList', 'oai_oi:oi/oi:description/text()'),
            'abstract': ('textList', 'oai_oi:oi/oi:abstract/text()'),
            'publisher': ('textList', 'oai_oi:oi/oi:publisher/text()'),
            'contributor': ('textList', 'oai_oi:oi/oi:contributor/text()'),
            'date': ('textList', 'oai_oi:oi/oi:date/text()'),
            'type': ('textList', 'oai_oi:oi/oi:type/text()'),
            'extent': ('textList', 'oai_oi:oi/oi:extent/text()'),
            'medium': ('textList', 'oai_oi:oi/oi:medium/text()'),
            'identifier': ('textList', 'oai_oi:oi/oi:identifier/text()'),
            'source': ('textList', 'oai_oi:oi/oi:source/text()'),
            'language': ('textList', 'oai_oi:oi/oi:language/text()'),
            'references': ('textList', 'oai_oi:oi/oi:references/text()'),
            'spatial': ('textList', 'oai_oi:oi/oi:spatial/text()'),
            'attributionName': ('textList', 'oai_oi:oi/oi:attributionName/text()'),
            'attributionURL': ('textList', 'oai_oi:oi/oi:attributionURL/text()'),
            'license': ('textList', 'oai_oi:oi/oi:license/text()')
        },
        namespaces={
            'oai_oi': 'http://www.openbeelden.nl/feeds/oai/',
            # 'http://www.openarchives.org/OAI/2.0/oai_oi/',
            'oi': 'http://www.openbeelden.nl/oai/'
        }
    )
    URL = 'http://www.openbeelden.nl/feeds/oai/'

    # Initialize the OAI client
    self.registry = MetadataRegistry()
    self.registry.registerReader('oai_oi', oai_oi_reader)
    self.client = Client(URL, self.registry)

    # Test if the connection to the OAI-PMH provider works
    x = self.client.updateGranularity()
    x = self.client.identify()
    print 'identity %s' % x.repositoryName()
    print 'identity %s' % x.protocolVersion()
    print 'identity %s' % x.baseURL()

    """
    for s in client.listSets():
        print s
    """

    # initialize the OpenSKOSHandler
    self.openSKOSHandler = OpenSKOSHandler()
def __init__(self, session, configs, dbs, dbName):
    self.session = session
    self.protocolMap = configs[dbName]
    self.db = dbs[dbName]
    session.database = self.db.id
    # Get some generally useful stuff now
    self.baseURL = self.protocolMap.baseURL
    # Get earliest datestamp in database - UTC of the epoch as query term
    q = cqlparse('rec.lastModificationDate > "%s"'
                 '' % (str(datetime.datetime.utcfromtimestamp(0))))
    try:
        tl = self.db.scan(session, q, 1)
    except SRWDiagnostics.Diagnostic16:
        raise ConfigFileException(
            'Index map for '
            'rec.lastModificationDate required '
            'in protocolMap: %s'
            '' % self.db.get_path(session, 'protocolMap').id)
    else:
        try:
            datestamp = tl[0][0]
        except IndexError:
            # Something went wrong :( - use the epoch
            self.earliestDatestamp = datetime.datetime.utcfromtimestamp(0)
        else:
            try:
                self.earliestDatestamp = datetime.datetime.strptime(
                    datestamp, '%Y-%m-%dT%H:%M:%S')
            except ValueError:
                self.earliestDatestamp = datetime.datetime.strptime(
                    datestamp, '%Y-%m-%d %H:%M:%S')
    self.repositoryName = self.protocolMap.title
    self.protocolVersion = self.protocolMap.version
    self.adminEmails = self.protocolMap.contacts
    # Check for deletion support
    recordStore = self.db.get_path(session, 'recordStore')
    deletions = recordStore.get_setting(session, 'storeDeletions')
    # Cheshire3 cannot guarantee that deletions will persist
    self.deletedRecord = "transient" if deletions else "no"
    # Finest level of granularity
    self.granularity = "YYYY-MM-DDThh:mm:ssZ"
    # Cheshire3 does not support compressions at this time
    self.compression = []
    self.metadataRegistry = OaiMetadataRegistry()
def __init__(self, url=None, **kwargs):
    self.base_url = kwargs.pop('base_url', None)
    self.oai_path = kwargs.pop('oai_path', None)
    self.oai_enabled = bool(kwargs.pop('oai_enabled', True))
    self.sword_enabled = bool(kwargs.pop('sword_enabled', False))
    if url is not None:
        warn(
            'The url parameter will not be supported in version 3, '
            'use base_url and oai_path instead',
            DeprecationWarning)
        if (self.base_url and url.startswith(self.base_url)
                and self.oai_path is None):
            self.oai_path = url.replace(self.base_url, '', 1).lstrip('/')
        elif not self.base_url:
            if self.oai_path is None:
                self.oai_path = 'dspace-oai/request'
            if url.endswith(self.oai_path):
                self.base_url = url[:-(len(self.oai_path) + 1)]
    if self.base_url is None:
        raise ValueError('base_url argument must be specified')
    if 'metadata_registry' not in kwargs:
        kwargs['metadata_registry'] = MetadataRegistry()
        kwargs['metadata_registry'].registerReader('mets', dspace_mets_reader)
    if self.sword_enabled:
        skwargs = {'base_url': self.base_url}
        for key in kwargs.keys():
            if key.startswith('sword_'):
                skwargs[key[6:]] = kwargs.pop(key)
        self.sword = SwordService(**skwargs)
    if self.oai_enabled:
        self.oai = Client('/'.join((
            self.base_url,
            self.oai_path,
        )), **kwargs)
    self.identifier_base = self._extractIdentifierBase(self.base_url)
def _initialise_client(self, url):
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerReader('ore', oai_ore_reader)
    logging.info('Initialising OAI client with URL [%s]', url)
    return Client(url, registry)
def indexCollection(URL, url_base, metadata_prefix, collection, action):
    # pull data from OAI endpoint
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    client = Client(URL, registry, force_http_get=True)
    harvested_data = []
    for record in client.listRecords(metadataPrefix=metadata_prefix,
                                     set=collection):
        if not record[0].isDeleted():
            fields = record[1].getMap()
            if fields['subject']:
                fields['subjects'] = fields['subject'][0].split(';')
                del fields['subject']
            fields['set'] = record[0].setSpec()
            identifier = record[0].identifier().split(':')[2]
            fields['image_url_base'] = (
                url_base + '/digital/iiif/' + identifier + '/')
            harvested_data.append(fields)

    if action == 'reindex':
        es.indices.delete(index='digital_collection_recs', ignore=[400, 404])
        mapping = {
            "mappings": {
                "_doc": {
                    "properties": {
                        "title": {"type": "text"},
                        "creator": {"type": "text"},
                        "subjects": {"type": "text"},
                        "description": {"type": "text"},
                        "publisher": {"type": "text"},
                        "contributor": {"type": "text"},
                        "date": {"type": "text"},
                        "type": {"type": "text", "fielddata": "true"},
                        "format": {"type": "text", "fielddata": "true"},
                        "identifier": {"type": "text"},
                        "source": {"type": "text"},
                        "language": {"type": "text", "fielddata": "true"},
                        "relation": {"type": "text"},
                        "coverage": {"type": "text"},
                        "rights": {"type": "text"},
                        "set": {"type": "text", "fielddata": "true"},
                        "image_url_base": {"type": "text"}
                    }
                }
            }
        }
        es.indices.create(index='digital_collection_recs', body=mapping)

    helpers.bulk(es, harvested_data, index='digital_collection_recs',
                 doc_type='_doc')
    return "success"
                 'type', 'format', 'identifier', 'source', 'language',
                 'relation', 'coverage', 'rights']:
        for value in map.get(name, []):
            e = SubElement(element, nsdc(name), nsmap=nsmap)
            e.text = value
    for name in ['hasPart', 'isPartOf']:
        for value in map.get(name, []):
            e = SubElement(element, nsdcterms(name), nsmap=nsmap)
            e.text = value


metadata_registry = MetadataRegistry()
metadata_registry.registerWriter('oai_dc', fbc_oai_dc_writer)
metadata_registry.registerWriter('qdc', qdc_writer)
server = ServerBase(Catalogue(metadata_registry), metadata_registry,
                    {'topxsi': NS_XSI})


def oaipmh(request):
    resp = server.handleRequest(request.GET)
    return HttpResponse(resp, content_type='application/xml')
def harvest(metadata_set, dest_folder, log_file, content_type,
            from_date, until_date):

    # ############################# #
    # ### FILESYSTEM CHECKS ### #
    # ############################# #
    try:
        if not os.path.isdir(dest_folder):
            os.makedirs(dest_folder)
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create destination folder: %s" % dest_folder)

    # Verify write permission inside the folder:
    try:
        test_path = os.path.join(dest_folder, '__test_permissions__')
        os.makedirs(test_path)
        os.rmdir(test_path)
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to use destination folder: %s" % dest_folder)

    try:
        log_handle = open(log_file, 'a+')
        log_handle.close()
    except BaseException as e:
        log.error(str(e))
        log.exit("Unable to create log_file: %s" % log_file)

    # ################################# #
    # ### OAI-PMH CONFIGURATION ### #
    # ################################# #
    URL = 'https://node0-d-efg.d4science.org/efg/mvc/oai/oai.do'
    metadata_prefix = 'efg'

    # ################################### #
    # ### OPEN OAI-PMH CONNECTION ### #
    # ################################### #
    registry = MetadataRegistry()
    registry.registerReader(metadata_prefix, oai_dc_reader)
    # print("URL=" + str(URL))
    client = Client(URL, registry)

    # #################################### #
    # ### CHECK IF THIS SET EXISTS ### #
    # #################################### #
    set_found = False
    for s in client.listSets():
        if metadata_set == s[0]:
            set_found = True
    if not set_found:
        log.exit("Unable to find this set: %s" % metadata_set)

    # ############################# #
    # ### RETRIEVE METADATA ### #
    # ############################# #
    if from_date is not None:
        from_date = parse_date(from_date)
        if from_date is None:
            log.exit("Unable to convert from date")
    if until_date is not None:
        until_date = parse_date(until_date)
        if until_date is None:
            log.exit("Unable to convert until date")

    report_data = {
        'downloaded': 0,
        'filtered': 0,
        'saved': 0,
        'saved_files': [],
        'missing_sourceid': [],
        'wrong_content_type': []
    }

    timestamp = int(1000 * time.time())
    log.info("Retrieving records for %s..." % metadata_set)
    try:
        records = client.listRecords(
            metadataPrefix=metadata_prefix,
            set=metadata_set,
            from_=from_date,
            until=until_date)
    except NoRecordsMatchError as e:
        log.exit(e)

    log.info("Records retrieved, extracting...")
    try:
        for record in records:
            element = record[1].element()
            # The obtained eTree is based on namespaced XML.
            # Read: 19.7.1.6. Parsing XML with Namespaces
            # https://docs.python.org/2/library/xml.etree.elementtree.html
            # find(match): finds the first subelement matching match;
            #   match may be a tag name or path; returns an element
            #   instance or None.
            # findall(match): finds all matching subelements, by tag name
            #   or path; returns a list containing all matching elements
            #   in document order.

            report_data['downloaded'] += 1
            if report_data['downloaded'] % 100 == 0:
                print('.', end='', flush=True)
            if report_data['downloaded'] % 5000 == 0:
                print(
                    ' %s downloaded - %s saved' % (
                        report_data['downloaded'],
                        report_data['saved']
                    ),
                    flush=True)

            efgEntity = element.find(tag("efgEntity"))
            if efgEntity is None:
                # log.warning("efgEntity not found, skipping record")
                continue
            avcreation = efgEntity.find(tag("avcreation"))
            nonavcreation = efgEntity.find(tag("nonavcreation"))
            if avcreation is not None:
                manifestation = avcreation.find(tag("avManifestation"))
                recordSource = avcreation.find(tag("recordSource"))
                keywords = avcreation.findall(tag("keywords"))
                title_el = avcreation.find(tag("identifyingTitle"))
                title = (title_el.text
                         if title_el is not None else "Unknown title")
            elif nonavcreation is not None:
                manifestation = nonavcreation.find(tag("nonAVManifestation"))
                recordSource = nonavcreation.find(tag("recordSource"))
                keywords = nonavcreation.findall(tag("keywords"))
                title_el = nonavcreation.find(tag("title"))
                title = (title_el.find(tag("text")).text
                         if title_el is not None else "Unknown title")
            else:
                title = "Unknown title"
                # log.warning("(non)avcreation not found, skipping record")
                continue

            filter_keyword = "IMediaCities"
            is_good = False
            for keyword in keywords:
                term = keyword.find(tag("term"))
                if term.text == filter_keyword:
                    is_good = True
                    break
            if not is_good:
                continue
            report_data['filtered'] += 1

            if manifestation is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("avManifestation not found, skipping record")
                continue

            if content_type is not None:
                content_type = content_type.lower()
                item = manifestation.find(tag("item"))
                if item is None:
                    # missing <item> => type cannot be found
                    report_data['wrong_content_type'].append(title)
                    continue
                item_type = item.find(tag("type"))
                if item_type is None:
                    # missing <type>
                    report_data['wrong_content_type'].append(title)
                    continue
                if item_type.text.lower() != content_type:
                    # wrong type
                    report_data['wrong_content_type'].append(title)
                    continue

            # NOTE: the sourceID must be taken from the recordSource under
            # avcreation/nonavcreation, NOT from the one under
            # avManifestation/nonAVManifestation
            # recordSource = manifestation.find(tag("recordSource"))
            if recordSource is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("recordSource not found, skipping record")
                continue
            sourceID = recordSource.find(tag("sourceID"))
            if sourceID is None:
                report_data['missing_sourceid'].append(title)
                # log.warning("sourceID not found, skipping record")
                continue

            content = etree.tostring(efgEntity, pretty_print=False)

            # id_text = urllib.parse.quote_plus(sourceID.text.strip())
            # replace non alpha-numeric characters with a dash
            id_text = re.sub(r'[\W_]+', '-', sourceID.text.strip())
            # end of Cinzia's changes

            filename = "%s_%s_%s.xml" % (metadata_set, id_text, timestamp)
            filepath = os.path.join(dest_folder, filename)
            # with open(filepath, 'wb') as f:
            with codecs.open(filepath, 'wb', "utf-8") as f:
                f.write(content.decode('utf-8'))
            # OLD
            # with codecs.open(filepath, 'wb', "utf-8") as f:
            #     f.write(html.unescape(content.decode('utf-8')))

            report_data['saved'] += 1
            report_data['saved_files'].append(filename)
    except NoRecordsMatchError as e:
        log.warning("No more records after filtering?")
        log.warning(e)

    # ###################
    # Write report file
    # ###################
    # The procedure writes a report file containing the results of the
    # harvesting: the list of records that do not contain the record ID
    # (by writing the content of the element title).
    with open(log_file, 'w+') as f:
        json.dump(report_data, f)
        f.close()

    # Just to close previous dot line
    print("")

    log.info("""
%s records from set [%s] downloaded
open log file [%s] for details
""" % (report_data['saved'], metadata_set, log_file))
def get_names(dataname):
    record_prefix = "rdf:RDF/edm:ProvidedCHO"
    # Modify/add XPath mappings to get other fields and other objects
    # (agent, place etc)
    edm_reader = MetadataReader(
        fields={
            'title': ('textList', record_prefix + '/dc:title/text()'),
            'creator': ('textList', record_prefix + '/dc:creator/text()'),
            'subject': ('textList', record_prefix + '/dc:subject/text()'),
            'description': ('textList', record_prefix + '/dc:description/text()'),
            'publisher': ('textList', record_prefix + '/dc:publisher/text()'),
            'contributor': ('textList', record_prefix + '/dc:contributor/text()'),
            'date': ('textList', record_prefix + '/dc:date/text()'),
            'type': ('textList', record_prefix + '/dc:type/text()'),
            'format': ('textList', record_prefix + '/dc:format/text()'),
            'identifier': ('textList', record_prefix + '/dc:identifier/text()'),
            'source': ('textList', record_prefix + '/dc:source/text()'),
            'language': ('textList', record_prefix + '/dc:language/text()'),
            'relation': ('textList', record_prefix + '/dc:relation/text()'),
            'coverage': ('textList', record_prefix + '/dc:coverage/text()'),
            'rights': ('textList', record_prefix + '/dc:rights/text()'),
            'spatial': ('textList', record_prefix + '/dc:spatial/text()'),
            'objectId': ('textList', record_prefix + '/@rdf:about'),
        },
        namespaces={
            'oai_dc': 'http://www.openarchives.org/OAI/2.0/oai_dc/',
            'dc': 'http://purl.org/dc/elements/1.1/',
            'dcterms': 'http://purl.org/dc/terms/',
            'dct': 'http://purl.org/dc/terms/',
            'edm': 'http://www.europeana.eu/schemas/edm/',
            'foaf': 'http://xmlns.com/foaf/0.1/',
            'owl': 'http://www.w3.org/2002/07/owl#',
            'rdf': 'http://www.w3.org/1999/02/22-rdf-syntax-ns#',
            'rdfs': 'http://www.w3.org/2000/01/rdf-schema#',
            'skos': 'http://www.w3.org/2004/02/skos/core#',
            'xsi': 'http://www.w3.org/2001/XMLSchema-instance',
            'ore': 'http://www.openarchives.org/ore/terms/'
        })

    names = []
    identifier = []
    if __name__ == "__main__":
        URL = 'https://data.jhn.ngo/oai'
        registry = MetadataRegistry()
        registry.registerReader('edm', edm_reader)
        client = Client(URL, registry)
        # To harvest a specific dataset, use the "set" parameter:
        # set='AIUJE1_MARC21'
        for record in client.listRecords(metadataPrefix='edm', set=dataname):
            output = record[1].getMap()
            if output['creator'] != []:
                names.append([output['creator'][0]])
                identifier.append(
                    [output['creator'][0], output['objectId'][0]])
            if output['contributor'] != []:
                names.append([output['contributor'][0]])
                identifier.append(
                    [output['contributor'][0], output['objectId'][0]])
    print(names)
    return identifier
def _create_metadata_registry(self):
    registry = MetadataRegistry()
    registry.registerReader('oai_dc', oai_dc_reader)
    registry.registerReader('oai_ddi', oai_ddi_reader)
    return registry