def generate(self):
    """Returns a list of ResourceSync resources that each represent one full
    OAI-PMH record (i.e., the result of a GetRecord request).
    """
    provider = Sickle(self.params['oaipmh_base_url'])
    headers = provider.ListIdentifiers(
        ignore_deleted=True,
        set=self.params['oaipmh_set'],
        metadataPrefix=self.params['oaipmh_metadataprefix'])
    return list(map(self.oaipmh_header_to_resourcesync_resource, headers))
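# A minimal sketch of what oaipmh_header_to_resourcesync_resource might do
# (hypothetical; the real mapper is not part of the snippet above). It assumes the
# resync library's Resource class, mapping the OAI identifier to the resource URI
# and the header datestamp to lastmod.
from resync.resource import Resource

def oaipmh_header_to_resourcesync_resource(self, header):
    # header.identifier and header.datestamp are attributes of sickle's Header model
    return Resource(uri=header.identifier, lastmod=header.datestamp)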
def test_no_retry(self):
    mock_response = Mock(status_code=503,
                         headers={'retry-after': '10'},
                         raise_for_status=Mock(side_effect=HTTPError))
    mock_get = Mock(return_value=mock_response)
    with patch.object(Session, 'get', mock_get):
        sickle = Sickle('url')
        try:
            sickle.ListRecords()
        except HTTPError:
            pass
        self.assertEqual(1, mock_get.call_count)
def list_records():
    sik = Sickle(URL_PREPRINTS_OAI, verify=False)
    records = sik.ListRecords(**{
        'metadataPrefix': 'oai_dc',
        'from': '2021-04-01',
        'until': '2021-04-07',
        'set': 'scielo'
    })
    for r in records:
        doc = doc_raw_attrs(r)
        print(doc)
def get_xml_1(oai_identifier):
    sickle = Sickle("https://dspace.cuni.cz/oai/nusl")
    record = sickle.GetRecord(metadataPrefix="xoai", identifier=oai_identifier)
    file_directory = Path(__file__).parent
    target_directory = file_directory / ".." / "tests" / "data"
    oai_identifier_array = oai_identifier.split(":")
    oai_identifier_fixed = oai_identifier_array[-1]
    oai_identifier_fixed = oai_identifier_fixed.replace(".", "_")
    oai_identifier_fixed = oai_identifier_fixed.replace("/", "-")
    filename = str(target_directory / f"{oai_identifier_fixed}.xml")
    with open(filename, "w+") as f:
        f.write(record.raw)
    print(filename, "created")
def __init__(self, source_url: str, format: str, set: str = None) -> None:
    self.source_url = source_url
    self.format = format
    self.set = set
    self.ids: Optional[Iterator] = None
    client = Sickle(self.source_url)
    self.client = client
    session = requests.Session()
    self.session = session
    self.namespace = {
        "oai": "http://www.openarchives.org/OAI/2.0/",
        "dim": "http://www.dspace.org/xmlns/dspace/dim",
    }
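# Hypothetical usage sketch (not part of the original class): the namespace map above
# is the kind of mapping used with lxml XPath to pull DSpace "dim" fields out of a
# harvested record, e.g.:
# record = self.client.GetRecord(identifier=some_identifier, metadataPrefix=self.format)
# titles = record.xml.xpath('.//dim:field[@element="title"]/text()',
#                           namespaces=self.namespace)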
def test_pass_request_args(self):
    mock_response = Mock(text='<xml/>')
    mock_get = Mock(return_value=mock_response)
    with patch('sickle.app.requests.get', mock_get):
        sickle = Sickle('url', timeout=10, proxies=dict(),
                        auth=('user', 'password'))
        sickle.ListRecords()
        mock_get.assert_called_once_with('url',
                                         params={'verb': 'ListRecords'},
                                         timeout=10, proxies=dict(),
                                         auth=('user', 'password'))
def test_pass_request_args(self):
    mock_response = Mock(text=u'<xml/>', content='<xml/>', status_code=200)
    mock_get = Mock(return_value=mock_response)
    with patch.object(Session, 'get', mock_get):
        sickle = Sickle('url', timeout=10, proxies=dict(),
                        auth=('user', 'password'))
        sickle.ListRecords()
        mock_get.assert_called_once_with('url',
                                         params={'verb': 'ListRecords'},
                                         timeout=10, proxies=dict(),
                                         auth=('user', 'password'))
def client_for_repository(repository):
    """
    Return a sickle client object pre-configured for the passed repository.
    """
    # Extra arguments to the Sickle constructor
    client_args = {}

    # If there is a basic auth configuration, add it to the client args
    if repository.basic_auth_user != '':
        client_args['auth'] = (repository.basic_auth_user,
                               repository.basic_auth_password)

    # Construct client object
    return Sickle(repository.url, **client_args)
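# Hypothetical usage sketch (the repository object here is assumed, not the project's
# actual model; any object with url/basic_auth_user/basic_auth_password attributes works):
# from types import SimpleNamespace
# repo = SimpleNamespace(url='https://example.org/oai/request',
#                        basic_auth_user='', basic_auth_password='')
# client = client_for_repository(repo)
# records = client.ListRecords(metadataPrefix='oai_dc')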
def get_oai_pmh_metadata(self, base_url: str) -> Dict[str, str]:
    """Returns a dictionary containing top-level metadata and set metadata of
    an OAI-PMH repository."""
    logging.debug(
        'Retrieving repository and set metadata from OAI-PMH repository %s',
        base_url)
    try:
        metadata = {}

        # All repositories should have this metadata.
        repository_metadata = Sickle(base_url, timeout=60).Identify()
        if hasattr(repository_metadata, 'repositoryIdentifier'):
            metadata['repository_identifier'] = repository_metadata.repositoryIdentifier
        if hasattr(repository_metadata, 'repositoryName'):
            metadata['repository_name'] = repository_metadata.repositoryName

        # Not all repositories will support sets.
        try:
            set_metadata = Sickle(base_url, timeout=60).ListSets()
            metadata.update({
                'sets': {s.setSpec: s.setName for s in list(set_metadata)}
            })
        except sickle.oaiexceptions.NoSetHierarchy as e:
            logging.debug(
                'Failed to list sets from OAI-PMH repository %s: %s',
                base_url, e)

        return metadata
    except requests.RequestException as e:
        raise IndexerError(
            'Failed to get repository metadata from OAI-PMH repository {}: {}'
            .format(base_url, e))
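# Hypothetical usage sketch (the indexer object and the returned values are
# illustrative only, not real repository data):
# metadata = indexer.get_oai_pmh_metadata('https://example.org/oai/request')
# -> {'repository_identifier': 'example.org',
#     'repository_name': 'Example Repository',
#     'sets': {'com_123456_1': 'Some community', 'col_123456_2': 'Some collection'}}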
def test_retry_on_custom_code(self):
    mock_response = Mock(status_code=500,
                         raise_for_status=Mock(side_effect=HTTPError))
    mock_get = Mock(return_value=mock_response)
    with patch.object(Session, 'get', mock_get):
        sickle = Sickle('url', max_retries=3, default_retry_after=0,
                        retry_status_codes=(503, 500))
        try:
            sickle.ListRecords()
        except HTTPError:
            pass
        mock_get.assert_called_with('url', params={'verb': 'ListRecords'})
        self.assertEqual(4, mock_get.call_count)
def get_events(self, **kwargs):
    LOG.debug(f"Executing {PORTAL_NAME} get events")
    if not self.users:
        LOG.debug("no users. exiting.")
        return False
    records_url = self.portal.get("event_urls", {}).get("oai_pmh_url")
    last_run = datetime.now()
    most_recent_datetime = self.get_most_recent_date(self.users)
    if most_recent_datetime:
        LOG.debug("start date value found in tracker state db entry.")
        from_datetime_str = most_recent_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
        from_datetime = most_recent_datetime
        LOG.debug("earliest date allowed: {}".format(from_datetime_str))
    else:
        until = tracker_app.app.config.get("DISALLOW_EVENTS_BEFORE")
        if until:
            from_datetime = datetime.strptime(until, "%Y-%m-%dT%H:%M:%SZ")
            from_datetime_str = from_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
        else:
            from_datetime = datetime.now() - timedelta(days=1)
            from_datetime_str = from_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")
    LOG.debug("searching oai-pmh interface: %s" % records_url)
    try:
        sickle = Sickle(records_url)
        records = sickle.ListRecords(**{
            'metadataPrefix': 'oai_dc',
            'from': from_datetime_str
        })
        if records.oai_response.http_response.status_code != 200:
            LOG.debug("non-200 response code received. "
                      "updating tracker status and exiting.")
            self.complete_tracker(
                records.oai_response.http_response.status_code)
            return False
    except oaiexceptions.NoRecordsMatch:
        LOG.debug("end of records in oai-pmh response")
        self.complete_tracker(
            records.oai_response.http_response.status_code)
        return False
    self.parse_records(records, from_datetime, last_run)
def __iter__(self):
    """Return a new :class:`~polymatheia.data.NavigableDictIterator` as the iterator.

    If ``max_records`` is set, then the :class:`~polymatheia.data.NavigableDictIterator`
    is wrapped in a :class:`~polymatheia.data.LimitingIterator`.
    """
    it = NavigableDictIterator(
        Sickle(self._url).ListRecords(metadataPrefix=self._metadata_prefix,
                                      set=self._set_spec,
                                      ignore_deleted=True),
        mapper=lambda record: xml_to_navigable_dict(
            etree.fromstring(
                record.raw,
                parser=etree.XMLParser(remove_comments=True))))
    if self._max_records is not None:
        it = LimitingIterator(it, self._max_records)
    return it
def harvest(host, from_date, until, format, out, set, verbose):
    counter = 0
    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    logging.info("OAI-PMH harvesting from %s", host)
    logging.info("From date = %s", from_date)
    logging.info("Until date = %s", until)
    logging.info("Metadata format = %s", format)
    logging.info("Outfile = %s", out)
    mysickle = Sickle(host, iterator=OAIItemIterator)
    params = {'metadataPrefix': format, 'from': from_date, 'until': until}
    if set is not None:
        params['set'] = set
    try:
        responses = mysickle.ListIdentifiers(**params)
    except NoRecordsMatch:
        logging.info("No records harvested: the combination of the values of "
                     "the arguments results in an empty list.")
        sys.exit()
    identifier_list = []
    for records in responses:
        identifier_list.append(records.identifier)
    logging.info(f"Identifier count to harvest: {len(identifier_list)}")
    with open(out, 'wb') as f:
        f.write('<records>'.encode())
        for identifier in identifier_list:
            r = mysickle.GetRecord(identifier=identifier, metadataPrefix=format)
            f.write(r.raw.encode('utf8'))
            logging.debug(counter)
            logging.debug(r.raw)
            counter += 1
        f.write('</records>'.encode())
    logging.info("Total records harvested: %i", counter)
def main():
    # inputs
    sleep_ct = 900  # number of records between sleeps
    sleep_time = 30  # secs
    base_url = 'http://export.arxiv.org/oai2'
    fname_prefix = "./raw_data/arXiv_oai_dc_"
    fname_log = "./raw_data/harvest.log"

    # create sickle
    sickle = Sickle(base_url)

    # get list of setSpecs
    ls_setSpec = get_ls_setSpec(sickle)
    ct_sets = len(ls_setSpec)

    # read log file to get last harvest date
    dt_last_harvest = get_dt_last_harvest(fname_log)

    # append records
    ct_records = 0
    for setSpec in ls_setSpec:
        print(setSpec)

        # get data file
        fname_data = fname_prefix + setSpec.replace(":", "_") + ".oai"
        f_data = open(fname_data, 'a')

        # append records
        records = sickle.ListRecords(**{
            "metadataPrefix": "oai_dc",
            "set": setSpec,
            "from": dt_last_harvest
        })
        for record in records:
            ct_records += 1
            f_data.write(str(record.metadata) + '\n')
            f_data.write(str(record.header) + '\n')
            if ct_records % sleep_ct == 0:
                print("sleep for %d secs" % sleep_time)
                time.sleep(sleep_time)
        f_data.close()

    # log harvest
    logger = csv.writer(open(fname_log, 'a'))
    dt_prev = dt_last_harvest
    dt_curr = datetime.datetime.today().date() - relativedelta(days=1)
    logger.writerow([dt_curr, dt_prev, ct_sets, ct_records])
def list_set_records(setSpec):
    set_recs = []
    sickle = Sickle(admin.get_repository_url())
    try:
        recs = sickle.ListRecords(metadataPrefix='oai_dc', set=setSpec)
        for rec in recs:
            # rec = recs.next()
            set_recs.append({
                "identifier": rec.header.identifier,
                "datestamp": rec.header.datestamp,
                "setSpec": rec.header.setSpecs,
                "dc": rec.metadata,
            })
    except Exception as e:
        pass
    # return [rec_type, rec.metadata, rec.header.identifier, rec.header.setSpecs,
    #         rec.header.datestamp, rec.header.deleted, rec.raw]
    return set_recs
def run(self):
    """
    Run the process to update pre-prints in Solr.
    """
    if self.args.delete:
        self.solr.delete(self.args.delete, commit=True)
    else:
        print("Indexing in {0}".format(self.solr.url))
        sickle = Sickle(self.args.oai_url, verify=False)
        filters = {'metadataPrefix': 'oai_dc'}
        if self.args.time:
            filters['from'] = self.from_date.strftime("%Y-%m-%dT%H:%M:%SZ")
        try:
            records = sickle.ListRecords(**filters)
        except NoRecordsMatch as e:
            print(e)
            sys.exit(0)
        else:
            for i, record in enumerate(records):
                try:
                    xml = self.pipeline_to_xml(record.xml)
                    print("Indexing record %s with oai id: %s"
                          % (i, record.header.identifier))
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    print("ValueError: {0}".format(e))
                    print(e)
                    continue
                except Exception as e:
                    print("Error: {0}".format(e))
                    print(e)
                    continue

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
def test_retry_on_503(self):
    mock_response = Mock(status_code=503,
                         headers={'retry-after': '10'},
                         raise_for_status=Mock(side_effect=HTTPError))
    mock_get = Mock(return_value=mock_response)
    sleep_mock = Mock()
    with patch('time.sleep', sleep_mock):
        with patch.object(Session, 'get', mock_get):
            sickle = Sickle('url', max_retries=3, default_retry_after=0)
            try:
                sickle.ListRecords()
            except HTTPError:
                pass
            mock_get.assert_called_with('url', params={'verb': 'ListRecords'})
            self.assertEqual(4, mock_get.call_count)
            self.assertEqual(3, sleep_mock.call_count)
            sleep_mock.assert_called_with(10)
def list_oai_community_sets(self, repository):
    """
    Constructs a list of tuples of communities (a grouping concept in OAI)
    for the given repository. Utilizes the OAI-PMH verb ListSets.
    """
    try:
        sickle = Sickle(repository.base_url)
        sets = sickle.ListSets()
    except Exception:
        return

    # Filter the set list to build the list of community sets
    for i in sets:
        # Build community tuples (id, human readable name)
        if i.setSpec[:3] == 'com':
            set_data = (i.setSpec, i.setName)
            self.communities.append(set_data)

    self.communities = sorted(self.communities, key=lambda i: i[1])
def main():
    if collection == 't':
        collectionID = '129651'
    if collection == 'd':
        collectionID = '129652'
    sickle = Sickle('http://arizona.openrepository.com/arizona/oai/request?')
    # sets = sickle.ListSets()
    recs = sickle.ListRecords(**{
        'metadataPrefix': 'oai_dc',
        'set': 'col_10150_' + collectionID,
        'from': date,
        'until': date_until
    })
    # log.debug("Making request to {}".format(recs))
    # try:
    #     response = recs
    # except Exception as e:
    #     log.exception("An error occurred in issuing the request!")
    #     raise
    # log.debug("Request completed")
    # # log.debug("Response Code: {}".format(response.status_code))
    # # log.debug("Response text: {}".format(response.text))
    # log.debug("Trying to convert response to JSON...")
    # try:
    #     response = response
    #     log.debug("Response successfully converted to JSON: {}".format(response))
    # except Exception as e:
    #     log.exception("An error occurred!")
    #     raise
    # print(recs.url)
    newFile.write('<?xml version="1.0" encoding="utf-8"?>')
    newFile.write(
        '<OAI-PMH xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:OAI-PMH="http://www.openarchives.org/OAI/2.0/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">'
    )
    for r in recs:
        newR = str(r)
        # newR = newR.decode('utf8')
        newFile.write(str(newR))
    newFile.write('</OAI-PMH>')
def parse_list(self, response):
    sickle = Sickle(self.url)
    params = {
        'metadataPrefix': self.format,
        'set': response.meta['set'],
        'from': response.meta['from_date'],
        'until': self.until_date,
    }
    try:
        records = sickle.ListRecords(**params)
    except NoRecordsMatch as err:
        LOGGER.warning(err)
        # PEP 479: raising StopIteration inside a generator is turned into a
        # RuntimeError, so end the generator with a plain return instead.
        return

    # Avoid timing out the resumption token
    # TODO: implement a storage-based solution, to be able to handle large
    # amounts of records.
    records = list(records)

    LOGGER.info(
        'Harvested %s records for params %s',
        len(records),
        params,
    )

    for record in records:
        rec_identifier = self.get_record_identifier(record)
        if rec_identifier in self._crawled_records:
            # avoid cross-set repeated records
            LOGGER.info('Skipping duplicated record %s', rec_identifier)
            continue

        LOGGER.debug(
            'Not skipping non-duplicated record %s',
            rec_identifier,
        )
        self._crawled_records[rec_identifier] = record
        response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
        selector = Selector(response, type='xml')
        try:
            yield self.parse_record(selector)
        except Exception as err:
            LOGGER.error(err)
def get_bitstream_url(collection, record_in):
    """
    Harvests an href pointing to the bitstream urls for the record in the repository.
    E.g., https://scholarspace.manoa.hawaii.edu/bitstream/10125/25006/1/editor.pdf
    """
    sickle = Sickle(collection.community.repository.base_url)
    sickle.class_mapping['GetRecord'] = LltRecordBitstream
    record = sickle.GetRecord(metadataPrefix='ore',
                              identifier=record_in.header.identifier)
    bitstreams = {'bitstream': None, 'bitstream_txt': None}
    try:
        bitstreams['bitstream'] = record.metadata['bitstream'][0].replace('+', '%20')
    except Exception as e:
        print(e, 'Unable to construct bitstream url for',
              record_in.header.identifier)
    try:
        bitstreams['bitstream_txt'] = record.metadata['bitstream_txt'][0].replace('+', '%20')
    except Exception as e:
        print(e, 'Unable to construct bitstream_txt url for',
              record_in.header.identifier)
    return bitstreams
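# A minimal sketch of what a custom record class like LltRecordBitstream might look like
# (hypothetical; the real class is not shown in the snippet above). Sickle lets you swap
# in the record class via class_mapping, and with the DSpace ORE serialization the
# bitstream URLs typically appear as Atom <link> hrefs inside the record.
from sickle.models import Record

class BitstreamRecord(Record):
    """Record subclass that also exposes bitstream hrefs in the parsed metadata."""

    def get_metadata(self):
        metadata = super(BitstreamRecord, self).get_metadata()
        hrefs = [link.get('href')
                 for link in self.xml.iter('{http://www.w3.org/2005/Atom}link')
                 if link.get('href')]
        # Assumed convention: PDF-like links go under 'bitstream',
        # plain-text derivatives under 'bitstream_txt'.
        metadata['bitstream'] = [h for h in hrefs if h.lower().endswith('.pdf')]
        metadata['bitstream_txt'] = [h for h in hrefs if h.lower().endswith('.txt')]
        return metadata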
def list_sets(repository_url=None):
    repository_url = repository_url or admin.get_repository_url()
    sickle = Sickle(repository_url)
    setlist = []
    listsets = sickle.ListSets()
    try:
        for i in range(500):
            s = listsets.next()
            setlist.append(
                {
                    'setSpec': s.setSpec,
                    'setName': s.setName,
                }
            )
    except StopIteration:
        pass
    except Exception as e:
        abort(400, e)
    return setlist
def get_direct_records(context, params):
    records = []
    i = 0
    root = OAIBridge.data["contexts"][context]
    for name in root:
        sickle = Sickle(root[name]['url'])
        sets = root[name]['sets'] if 'sets' in root[name] else None
        if not sets:
            try:
                for record in sickle.ListRecords(**params):
                    i += 1
                    if not record.deleted:
                        records.append(record.metadata)
            except NoRecordsMatch:
                pass
            except Exception:
                print(traceback.format_exc())
                break
        else:
            unknown_error = False
            for set_name in sets:
                new_params = dict(params)
                new_params['set'] = set_name
                try:
                    for record in sickle.ListRecords(**new_params):
                        i += 1
                        if not record.deleted:
                            records.append(record.metadata)
                except NoRecordsMatch:
                    pass
                except Exception:
                    print(traceback.format_exc())
                    unknown_error = True
                    break
            if unknown_error:
                break
    return i, records
def run(self):
    """
    Run the process to update pre-prints in Solr.
    """
    if self.args.delete:
        self.solr.delete(self.args.delete, commit=True)
    else:
        logger.info("Indexing in {0}".format(self.solr.url))
        sickle = Sickle(self.args.oai_url)
        records = sickle.ListRecords(**{
            'metadataPrefix': 'oai_dc',
            'from': self.from_date.strftime("%Y-%m-%dT%H:%M:%SZ")
        })
        for record in records:
            try:
                xml = self.pipeline_to_xml(record.xml)
                self.solr.update(xml, commit=True)
            except ValueError as e:
                logger.error("ValueError: {0}".format(e))
                logger.exception(e)
                continue
            except Exception as e:
                logger.error("Error: {0}".format(e))
                logger.exception(e)
                continue

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
def run(self, update_all=False, override=False):
    arxiv = Sickle('http://export.arxiv.org/oai2')
    # date = datetime.date(2014, 5, 14)
    # records = arxiv.ListRecords(**{'metadataPrefix': 'arXiv', 'from': str(date)})
    # print(str(datetime.date(2014, 5, 14)))
    last_update = Synchronization.query.order_by(
        Synchronization.id.desc()).first()
    if (datetime.datetime.utcnow() - last_update.date).days < 1:
        return 0
    if last_update is None or update_all:
        date = None
        records = arxiv.ListRecords(metadataPrefix='arXiv')
    else:
        date = last_update.date.date()
        records = arxiv.ListRecords(**{
            'metadataPrefix': 'arXiv',
            'from': str(date)
        })
    count = 0
    badrecords = []
    for r in records:
        count += 1
        if count % 1000 == 0:
            print(count)
        try:
            a = self.add_article(r.metadata)
        except Exception as e:
            badrecords.append(r)
            print("Exception: ", e)
        # print(a.title)
    db.session.commit()
    db.session.add(Synchronization(date=datetime.datetime.now()))
    db.session.commit()
    print("all done!")
    return count
def oai_get_record(id, name, transformation, record_cls, access_token=None,
                   identifier=None, dbcommit=False, reindex=False,
                   test_md5=False, verbose=False, debug=False, **kwargs):
    """Get record from an OAI repo.

    :param identifier: identifier of record.
    """
    url, metadata_prefix, lastrun, setspecs = get_info_by_oai_name(name)

    request = Sickle(url)

    params = {}
    if access_token:
        params['accessToken'] = access_token
    params['metadataPrefix'] = metadata_prefix
    params['identifier'] = f'{identifier}{id}'
    try:
        record = request.GetRecord(**params)
    except Exception as err:
        if debug:
            raise Exception(err)
        return None
    records = parse_xml_to_array(StringIO(record.raw))
    trans_record = transformation(records[0]).json
    if verbose:
        click.echo(f'OAI-{name} get: {id}')
    return trans_record
def __init__(self, endpoint='http://export.arxiv.org/oai2',
             metadataPrefix='oai_dc', harvest_set='cs',
             recsFrom=str(date.today() - relativedelta(days=1)),
             recsUntil=''):
    self.endpoint = endpoint
    self.metadataPrefix = metadataPrefix
    self.harvest_set = harvest_set
    self.recsFrom = recsFrom
    if not recsUntil:
        self.recsUntil = date.today()
    else:
        self.recsUntil = recsUntil
    # self.recsUntil = recsUntil
    self.responses = None
    self.recs = []
    self.idfiers = []
    self.descriptions = []
    self.sickle = Sickle(endpoint, iterator=OAIResponseIterator)
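# Hypothetical usage sketch (the harvester variable and the harvest loop are assumed,
# not part of the original snippet). With iterator=OAIResponseIterator, ListRecords
# yields whole OAI-PMH responses (one per HTTP request) instead of individual records:
# responses = harvester.sickle.ListRecords(
#     metadataPrefix=harvester.metadataPrefix,
#     set=harvester.harvest_set,
#     **{'from': harvester.recsFrom, 'until': str(harvester.recsUntil)})
# for response in responses:
#     print(response.raw[:200])  # raw XML of each ListRecords response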
print("[" + now + "]\t" + text) # forces to output the result of the print command immediately, see: http://stackoverflow.com/questions/230751/how-to-flush-output-of-python-print sys.stdout.flush() runningFromWithinStabi = False # main PPN harvesting savedRecords = [] if runningFromWithinStabi: proxy = urllib.request.ProxyHandler({}) opener = urllib.request.build_opener(proxy) urllib.request.install_opener(opener) # create OAI-PMH reader pointing to the Stabi OAI-PMH endpoint of the digitzed collections sickle = Sickle('http://digital.staatsbibliothek-berlin.de/oai') records = sickle.ListRecords(metadataPrefix='oai_dc', set='DC_all') if True: printLog("Starting OAI-PMH record download...") # initialize some variables for counting and saving the metadata records savedDocs = 0 maxDocs = 146000 # 100 is just for testing, for more interesting results increase this value to 1000. ATTENTION! this will also take more time for reading data. # save the records locally as we don't want to have to rely on a connection to the OAI-PMH server all the time # iterate over all records until maxDocs is reached # ATTENTION! if you re-run this cell, the contents of the savedRecords array will be altered! for record in records: # check if we reach the maximum document value if savedDocs < maxDocs:
#!/usr/bin/python3
from sickle import Sickle
from pymarc import Record, Field, MARCWriter
import os
import re

# configurations
save_file = 'c:\\users\\user\\desktop\\books.dat'

# delete old file (if exists)
os.system(f'del {save_file}')

# load OAI-PMH client
# documentation: https://sickle.readthedocs.io/en/latest/
sickle = Sickle('http://content.cdlib.org/oai')
records = sickle.ListRecords(metadataPrefix='oai_dc', set='YOUR_SET_ID_HERE')

# parse harvested records and generate MARC21
for record in records:

    # parse dc record
    dc = record.metadata
    contributors = dc.get("contributor", [])
    coverages = dc.get("coverage", [])
    creators = dc.get("creator", [])
    dates = dc.get("date", [])
    descriptions = dc.get("description", [])
    formats = dc.get("format", [])
    identifiers = dc.get("identifier", [])
    languages = dc.get("language", [])
    publishers = dc.get("publisher", [])
    relations = dc.get("relation", [])
def test_invalid_iterator(self):
    # Sickle raises TypeError when the iterator argument is not a BaseOAIIterator subclass
    with self.assertRaises(TypeError):
        Sickle("http://localhost", iterator=None)