def get_all_records(self):
    """Retrieves all available OAI records.

    Pages of records are fetched with the ``ListRecords`` verb,
    following ``resumptionToken``s until the list is exhausted.

    :returns: a generator that yields a tuple for each record, a tuple
        consists of the content-type and the content as a string.
    """
    resumption_token = None
    while True:
        req_params = {'verb': 'ListRecords'}
        if resumption_token:
            req_params['resumptionToken'] = resumption_token
        else:
            # The OAI-PMH spec defines 'resumptionToken' as an
            # exclusive argument: 'metadataPrefix' may only accompany
            # the first request of the list, never the token.
            req_params['metadataPrefix'] = self.metadata_prefix

        resp = self.oai_call(req_params)
        tree = self.parse_oai_response(resp)

        records = tree.xpath('.//oai:ListRecords/oai:record',
                             namespaces=self.namespaces)
        for record in records:
            yield 'application/xml', etree.tostring(record)

        # According to the OAI spec, we reached the last page of the
        # list if the 'resumptionToken' element is empty. Some OAI
        # implementations drop the element entirely on the last page,
        # in which case find() returns None and .text raises
        # AttributeError.
        try:
            resumption_token = tree.find('.//oai:resumptionToken',
                                         namespaces=self.namespaces).text
        except AttributeError:
            resumption_token = None

        if not resumption_token:
            log.debug('resumptionToken empty, done fetching list')
            break
def adlib_search_call(self, params=None):
    """Makes a call to the Adlib endpoint and returns the response as
    a parsed XML tree.

    :type params: dict
    :param params: a dictionary sent as arguments in the query string;
        entries override the defaults below
    :rtype: lxml.etree
    """
    default_params = {
        'database': self.adlib_database,
        'search': self.adlib_query,
        'xmltype': self.adlib_xmltype,
        'limit': self.per_page_limit,
        'startfrom': 0
    }
    # 'params=None' avoids the shared mutable default argument pitfall;
    # an explicit dict still overrides the defaults as before.
    if params:
        default_params.update(params)

    log.debug('Getting %s (params: %s)' % (self.adlib_base_url,
                                           default_params))
    r = self.http_session.get(self.adlib_base_url, params=default_params)
    r.raise_for_status()

    return etree.fromstring(r.content)
def get_all_records(self):
    """Retrieves all available OAI records.

    This method has to be specifically overwritten for OpenBeelden, as
    they encode the metadataPrefix in their resumption token, rather
    than having a separate HTTP GET parameter.

    :returns: a generator that yields a tuple for each record, a tuple
        consists of the content-type and the content as a string.
    """
    resumption_token = None
    while True:
        req_params = {'verb': 'ListRecords'}
        if resumption_token:
            # The token already encodes the metadataPrefix, so it must
            # be sent on its own (this fixes the culprit).
            req_params['resumptionToken'] = resumption_token
        else:
            req_params['metadataPrefix'] = self.metadata_prefix

        resp = self.oai_call(req_params)
        tree = self.parse_oai_response(resp)

        records = tree.xpath('.//oai:ListRecords/oai:record',
                             namespaces=self.namespaces)
        for record in records:
            yield 'application/xml', etree.tostring(record)

        # According to the OAI spec, we reached the last page of the
        # list if the 'resumptionToken' element is empty. Guard against
        # implementations that drop the element entirely on the last
        # page (find() then returns None).
        try:
            resumption_token = tree.find('.//oai:resumptionToken',
                                         namespaces=self.namespaces).text
        except AttributeError:
            resumption_token = None

        if not resumption_token:
            log.debug('resumptionToken empty, done fetching list')
            break
def opensearch_call(self, params=None):
    """Makes a call to the Opensearch endpoint and returns an XML tree.

    :type params: dict
    :param params: a dictionary sent as arguments in the query string
    :rtype: lxml.etree
    """
    # Avoid the shared mutable default argument pitfall
    params = params or {}

    log.debug('Getting %s (params: %s)' % (self.url, params))
    r = self.http_session.get(self.url, params=params)

    # In case a server error is returned (for example, a gateway
    # time-out), we retry the same request for a number of times
    max_retries = 10
    retried = 0
    # 'retried < max_retries' caps the attempts at exactly max_retries;
    # the original '<=' performed one retry too many ("retry 11 of 10")
    while r.status_code >= 500 and retried < max_retries:
        log.warning('Received server error (status %s), retry %s of %s'
                    % (r.status_code, retried + 1, max_retries))
        # Back off linearly before retrying
        sleep_s = retried + 1
        log.warning('Sleeping %s second(s) before retrying...' % sleep_s)
        sleep(sleep_s)
        r = self.http_session.get(self.url, params=params)
        retried += 1

    r.raise_for_status()

    return etree.fromstring(r.content)
def call(self, url, headers, data):
    """POSTs ``data`` with ``headers`` to the configured endpoint and
    returns the decoded JSON response.

    :param url: NOTE(review): this argument was only used by the retry
        branch in the original; the first request (and the log line)
        always targeted ``self.url``. The retries now consistently use
        ``self.url`` as well — confirm the parameter is indeed unused.
    :param headers: HTTP headers to send with the request
    :param data: the request body
    """
    log.debug('Getting %s (headers: %s, data: %s)'
              % (self.url, headers, data))
    r = requests.post(self.url, data=data, headers=headers)

    # In case a server error is returned (for example, a gateway
    # time-out), we retry the same request for a number of times
    max_retries = 10
    retried = 0
    # '<' caps the attempts at exactly max_retries; the original '<='
    # performed one retry too many ("retry 11 of 10")
    while r.status_code >= 500 and retried < max_retries:
        log.warning('Received server error (status %s), retry %s of %s'
                    % (r.status_code, retried + 1, max_retries))
        # Back off linearly before retrying
        sleep_s = retried + 1
        log.warning('Sleeping %s second(s) before retrying...' % sleep_s)
        sleep(sleep_s)
        r = requests.post(self.url, data=data, headers=headers)
        retried += 1

    r.raise_for_status()

    return r.json()
def api_call(self, url, params=None):
    """Makes a call to the Rijksmuseum API and returns the decoded
    JSON response.

    :param url: path appended to ``self.api_base_url``
    :type params: dict
    :param params: a dictionary sent as arguments in the query string
    """
    # Work on a copy: the original updated the caller's dict (and the
    # shared mutable default) in place
    params = dict(params or {})
    params.update(key=self.source_definition['rijksmuseum_api_key'],
                  format='json')

    url = '%s%s' % (self.api_base_url, url)

    log.debug('Getting %s (params: %s)' % (url, params))
    r = self.http_session.get(url, params=params)
    r.raise_for_status()

    return r.json()
def oai_call(self, params=None):
    """Makes a call to the OAI endpoint and returns the response as a
    string.

    :type params: dict
    :param params: a dictionary sent as arguments in the query string
    :returns: the raw response body
    """
    # Avoid the shared mutable default argument pitfall
    params = params or {}

    log.debug('Getting %s (params: %s)' % (self.oai_base_url, params))
    r = self.http_session.get(self.oai_base_url, params=params)
    r.raise_for_status()

    return r.content
def api_call(self, cursor, params=None):
    """Makes a call to the Europeana API (Benaki Museum data provider)
    and returns the decoded JSON response.

    :param cursor: the pagination cursor for the Europeana API
    :type params: dict
    :param params: extra arguments to send in the query string
    """
    # Work on a copy: the original updated the caller's dict (and the
    # shared mutable default) in place
    params = dict(params or {})
    params.update(
        wskey=self.source_definition['api_key'],
        # The raw, unencoded query; 'requests' performs the URL
        # encoding. The original hand-built a pre-encoded URL string
        # and interpolated the cursor verbatim, which breaks for
        # cursors containing characters that need escaping.
        query='DATA_PROVIDER:"Benaki Museum"',
        cursor=cursor
    )

    log.debug('Getting %s (params: %s)' % (self.api_base_url, params))
    r = self.http_session.get(self.api_base_url, params=params)
    r.raise_for_status()

    return r.json()
def opensearch_call(self, params=None):
    """Makes a call to the Opensearch endpoint and returns an XML tree.

    :type params: dict
    :param params: a dictionary sent as arguments in the query string
    :rtype: lxml.etree
    """
    # Avoid the shared mutable default argument pitfall
    params = params or {}

    log.debug('Getting %s (params: %s)' % (self.url, params))
    r = self.http_session.get(self.url, params=params)
    r.raise_for_status()

    return etree.fromstring(r.content)
def commons_api_call(self, image_name):
    """Use the Wikimedia Commons API to retrieve media metadata from
    Commons as XML. The response is returned as a string.

    :type image_name: str
    :param image_name: the title of the Commons page containing the
        image (e.g. ``File:Studioportretten.jpg``)
    """
    req_params = {
        'image': image_name,
        'forcehtml': '',
    }

    log.debug('Getting %s (params: %s)' % (self.commons_api_url, req_params))
    response = self.http_session.get(self.commons_api_url, params=req_params)
    response.raise_for_status()

    return response.content
def get_all_records(self):
    """Retrieves all available OAI records, skipping records the
    repository has marked as deleted.

    :returns: a generator that yields a tuple for each record, a tuple
        consists of the content-type and the content as a string.
    """
    resumption_token = None
    while True:
        req_params = {'verb': 'ListRecords'}
        if resumption_token:
            req_params['resumptionToken'] = resumption_token
        else:
            # The OAI-PMH spec defines 'resumptionToken' as an
            # exclusive argument: 'metadataPrefix' may only accompany
            # the first request of the list, never the token.
            req_params['metadataPrefix'] = self.metadata_prefix

        resp = self.oai_call(req_params)
        tree = self.parse_oai_response(resp)

        records = tree.xpath('.//oai:ListRecords/oai:record',
                             namespaces=self.namespaces)
        for record in records:
            # check if the record was deleted
            header = record.find('oai:header[@status="deleted"]',
                                 namespaces=self.namespaces)
            if header is not None:
                log.debug(
                    'Header specifies that the record is deleted, skipping.'
                )
                continue

            yield 'application/xml', etree.tostring(record)

        # According to the OAI spec, we reached the last page of the
        # list if the 'resumptionToken' element is empty. Some OAI
        # implementations completely drop the 'resumptionToken'
        # element on the last page.
        try:
            resumption_token = tree.find('.//oai:resumptionToken',
                                         namespaces=self.namespaces).text
        except AttributeError:
            resumption_token = None

        if not resumption_token:
            log.debug('resumptionToken empty, done fetching list')
            break
def get_all_records(self):
    """Retrieves all available OAI records, skipping records the
    repository has marked as deleted.

    :returns: a generator that yields a tuple for each record, a tuple
        consists of the content-type and the content as a string.
    """
    resumption_token = None
    while True:
        req_params = {'verb': 'ListRecords'}
        if resumption_token:
            req_params['resumptionToken'] = resumption_token
        else:
            # The OAI-PMH spec defines 'resumptionToken' as an
            # exclusive argument: 'metadataPrefix' may only accompany
            # the first request of the list, never the token.
            req_params['metadataPrefix'] = self.metadata_prefix

        resp = self.oai_call(req_params)
        tree = self.parse_oai_response(resp)

        records = tree.xpath('.//oai:ListRecords/oai:record',
                             namespaces=self.namespaces)
        for record in records:
            # check if the record was deleted
            header = record.find('oai:header[@status="deleted"]',
                                 namespaces=self.namespaces)
            if header is not None:
                log.debug('Header specifies that the record is deleted, '
                          'skipping.')
                continue

            yield 'application/xml', etree.tostring(record)

        # According to the OAI spec, we reached the last page of the
        # list if the 'resumptionToken' element is empty. Some OAI
        # implementations completely drop the 'resumptionToken'
        # element on the last page.
        try:
            resumption_token = tree.find('.//oai:resumptionToken',
                                         namespaces=self.namespaces).text
        except AttributeError:
            resumption_token = None

        if not resumption_token:
            log.debug('resumptionToken empty, done fetching list')
            break
def oai_call(self, params=None):
    """Makes a call to the OAI endpoint and returns the response as a
    string.

    :type params: dict
    :param params: a dictionary sent as arguments in the query string
    :returns: the raw response body
    """
    # Work on a copy so neither the caller's dict nor a shared default
    # is mutated by the modifications below
    params = dict(params or {})

    # Add the set variable to the parameters (if available)
    if self.oai_set:
        params['set'] = self.oai_set

    # Remove set and metadataPrefix when a resumptionToken is present;
    # the OAI-PMH spec defines 'resumptionToken' as an exclusive
    # argument. (The original code removed only 'set', contradicting
    # its own comment.)
    if 'resumptionToken' in params:
        params.pop('set', None)
        params.pop('metadataPrefix', None)

    log.debug('Getting %s (params: %s)' % (self.oai_base_url, params))
    r = self.http_session.get(self.oai_base_url, params=params)
    r.raise_for_status()

    return r.content
def wikimedia_api_call(self, params=None):
    """Calls the MediaWiki API and returns the response as a string.

    :type params: dict
    :param params: a dictionary sent as arguments in the query string;
        entries override the defaults below
    """
    req_params = {
        'action': 'query',
        'list': 'categorymembers',
        'cmtype': 'file',
        'cmtitle': self.wikimedia_category,
        'cmlimit': 250,
        'format': 'xml'
    }
    # Avoid the shared mutable default argument pitfall
    if params:
        req_params.update(params)

    # Log the parameters actually sent (the original logged only the
    # caller's overrides)
    log.debug('Getting %s (params: %s)' % (self.base_url, req_params))
    r = self.http_session.get(self.base_url, params=req_params)
    r.raise_for_status()

    return r.content
def oai_call(self, params=None):
    """Makes a call to the OAI endpoint and returns the response as a
    string.

    :type params: dict
    :param params: a dictionary sent as arguments in the query string
    :returns: the raw response body
    """
    # Work on a copy so neither the caller's dict nor a shared default
    # is mutated by the modifications below (the original assigned and
    # deleted keys directly on the passed-in dict)
    params = dict(params or {})

    # Add the set variable to the parameters (if available)
    if self.oai_set:
        params['set'] = self.oai_set

    # Remove set and metadataPrefix, when a resumptionToken is present
    if 'resumptionToken' in params:
        params.pop('set', None)
        params.pop('metadataPrefix', None)

    log.debug('Getting %s (params: %s)' % (self.oai_base_url, params))
    r = self.http_session.get(self.oai_base_url, params=params)
    r.raise_for_status()

    return r.content
def get_all_records(self):
    """Retrieves the metadata of every file page in the configured
    Wikimedia category.

    :returns: a generator that yields a tuple for each record, a tuple
        consists of the content-type and the content as a string.
    """
    cmcontinue = None
    while True:
        req_params = {}
        if cmcontinue:
            req_params['cmcontinue'] = cmcontinue

        # Get the file pages in the specified Wiki category
        file_pages = etree.fromstring(self.wikimedia_api_call(req_params))

        # Request the metadata of each page
        for file_page in file_pages.findall('.//cm'):
            page_title = file_page.attrib['title']
            page_meta = self.commons_api_call(page_title)
            page_meta_tree = etree.fromstring(page_meta)

            # Skip this page if the response contains errors (the Commons
            # API doesn't return proper HTTP status codes). Compare
            # against None explicitly: an lxml Element without children
            # is falsy, so the original truth test silently missed
            # childless <error> elements.
            page_meta_error = page_meta_tree.find('.//error')
            if page_meta_error is not None:
                log.warning('Skipping "%s" because of Commons API error: %s'
                            % (page_title, page_meta_error.text))
                continue

            yield 'application/xml', page_meta

        try:
            cmcontinue = file_pages.xpath(
                './/query-continue/categorymembers/@cmcontinue')[0]
        except IndexError:
            cmcontinue = None

        # When cmcontinue is empty or None, we've reached the last page
        if not cmcontinue:
            log.debug('cmcontinue empty, done fetching category pages')
            break
def adlib_search_call(self, params=None):
    """Makes a call to the Adlib endpoint and returns the response as
    a parsed XML tree.

    :type params: dict
    :param params: a dictionary sent as arguments in the query string;
        entries override the defaults below
    :rtype: lxml.etree
    """
    default_params = {
        'database': self.adlib_database,
        'search': self.adlib_query,
        'xmltype': self.adlib_xmltype,
        'limit': self.per_page_limit,
        'startfrom': 0
    }
    # 'params=None' avoids the shared mutable default argument pitfall;
    # an explicit dict still overrides the defaults as before.
    if params:
        default_params.update(params)

    log.debug('Getting %s (params: %s)' % (self.adlib_base_url,
                                           default_params))
    r = self.http_session.get(self.adlib_base_url, params=default_params)
    r.raise_for_status()

    return etree.fromstring(r.content)
def test_download_results(self):
    """Smoke test: the Arts Holland extractor yields results without
    raising."""
    extractor = ArtsHollandExtractor(
        {'url': 'http://api.artsholland.com/sparql'})
    # Consuming the generator is the test; the redundant trailing
    # 'pass' and the space before the call parenthesis were removed.
    for result in extractor.get_all_results():
        log.debug("result %s %s", result[0], result[1])
def test_download_results(self):
    """Smoke test: the Arts Holland extractor yields results without
    raising."""
    extractor = ArtsHollandExtractor(
        {'url': 'http://api.artsholland.com/sparql'})
    # Consuming the generator is the test; the redundant trailing
    # 'pass' was removed.
    for result in extractor.get_all_results():
        log.debug("result %s %s", result[0], result[1])