def main():
    """Harvest one set of the Arizona repository and dump it as OAI-PMH XML.

    Requests every ``oai_dc`` record of set ``com_10150_129649`` starting
    from 2017-04-05 and writes the records, wrapped in a single
    ``<OAI-PMH>`` root element, to ``newFile``.

    NOTE(review): ``newFile`` is not defined inside this function; it is
    presumably a writable text file opened at module level -- confirm.
    """
    sickle = Sickle('http://arizona.openrepository.com/arizona/oai/request?')
    # 'from' is a reserved word in Python, so the OAI arguments are passed
    # through a dict instead of as keyword arguments.
    recs = sickle.ListRecords(**{
        'metadataPrefix': 'oai_dc',
        'set': 'com_10150_129649',
        'from': '2017-04-05',
    })

    newFile.write('<?xml version="1.0" encoding="utf-8"?>')
    newFile.write(
        '<OAI-PMH xmlns:dc="http://purl.org/dc/elements/1.1/" '
        'xmlns:OAI-PMH="http://www.openarchives.org/OAI/2.0/" '
        'xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" '
        'xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" '
        'xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ '
        'http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">')
    for r in recs:
        # str(r) is already text; the former utf8 encode/decode round trip
        # produced the very same string and has been dropped.
        newFile.write(str(r))
    newFile.write('</OAI-PMH>')
def get_record_metadata(repository_url, identifier):
    """Return the ``metadata`` of one record fetched in ``oai_dc`` format.

    :param repository_url: OAI-PMH endpoint handed to ``Sickle``.
    :param identifier: OAI identifier of the record to fetch.
    :return: the ``metadata`` attribute of the record returned by GetRecord.
    """
    request_args = {'identifier': identifier, 'metadataPrefix': 'oai_dc'}
    return Sickle(repository_url).GetRecord(**request_args).metadata
def download_oai_dc(outfile=None,
                    base_url='http://bibliotheque-numerique.inha.fr/oai',
                    sets=16800, force_refresh=False, max_records=100):
    """Download the metadata of a set from an OAI-PMH repository.

    If ``outfile`` is given it doubles as a cache: when the file already
    exists (and ``force_refresh`` is false) its content is returned without
    contacting the repository; otherwise the freshly harvested metadata is
    written to it.

    :param outfile: optional path of the gzip-compressed JSON cache file.
    :param base_url: OAI-PMH endpoint to harvest.
    :param sets: numeric id, formatted into the set spec ``oai:sets:<id>``.
    :param force_refresh: harvest again even if ``outfile`` exists.
    :param max_records: maximum number of records to harvest; ``None``
        harvests everything.
    :return: list with the ``metadata`` of each harvested record, or the
        cached content of ``outfile``.
    """
    from itertools import islice

    # Guard on ``outfile`` first: os.path.exists(None) raises TypeError, so
    # the default ``outfile=None`` used to crash before any harvesting.
    if outfile and not force_refresh and os.path.exists(outfile):
        return read_json_gzip(outfile)

    sickle = Sickle(base_url)
    records = sickle.ListRecords(**{
        'metadataPrefix': 'oai_dc',
        'set': "oai:sets:%d" % sets
    })
    records_fetched = [
        record.metadata for record in islice(tqdm(records), max_records)
    ]

    if outfile:
        write_json_gzip(outfile, records_fetched)
    return records_fetched
def _get_random_configuration(self):
    """Build a random harvesting configuration from the known test servers.

    Picks between one and three servers (with replacement) and, for each,
    up to 20 of its first 20 set specs.  The result has the shape
    ``{'contexts': {'TEST': {'SERVER<i>': {'url': ..., 'sets': [...]}}}}``;
    the ``sets`` key is only present when at least one set was drawn.

    :return: the configuration dictionary.
    """
    self.servers = TestDynamicListRecords._get_servers()
    test_key = 'TEST'
    configuration = {'contexts': {test_key: {}}}
    context = configuration['contexts'][test_key]
    servers = random.choices(self.servers,
                             k=random.randint(1, min(3, len(self.servers))))
    i = 0
    for server in servers:
        server_key = 'SERVER' + str(i)
        context[server_key] = {'url': server}
        set_names = []
        server_sickle = Sickle(server)
        try:
            for set_name in server_sickle.ListSets():
                if len(set_names) == 20:
                    break
                set_names.append(set_name.setSpec)
        except Exception:
            # Best effort: a server whose sets cannot be listed is skipped.
            # ``i`` is not advanced, so the next server reuses this key.
            # (Was a bare ``except:``, which also trapped KeyboardInterrupt.)
            continue
        sets = set(
            random.choices(set_names,
                           k=random.randint(0, min(20, len(set_names)))))
        if sets:
            context[server_key]['sets'] = list(sets)
        i += 1
    return configuration
def run(self):
    """Harvest all ``marcxml`` records from invenio.nusl.cz and index them.

    Each record returned by ``ListRecords`` is indexed into ``self.index``
    under the value of its MARC ``001`` field.  When indexing a record
    fails, the traceback is printed and appended, together with the
    record's dictionary, to ``<timestamp>.err`` inside ``self.path``; the
    loop then carries on with the next record.
    """
    # Taken once so that every failure of this run lands in the same file.
    timestamp = datetime.utcnow()
    sickle = Sickle('http://invenio.nusl.cz/oai2d/')
    sickle.class_mapping['ListRecords'] = MarcXMLParser
    sickle.class_mapping['GetRecord'] = MarcXMLParser
    oai_logger.info("Loading records")
    records = sickle.ListRecords(metadataPrefix='marcxml')
    for idx, record in enumerate(records):
        print(f"{idx}. {record.id}")
        oai_logger.info(f"{idx}. {record.id}")
        try:
            current_search_client.index(
                index=self.index,
                id=record.marc_dict["001"],
                body=record.marc_dict
            )
        except Exception:
            # ``Exception`` rather than a bare ``except:`` so that Ctrl-C
            # and SystemExit can still stop a long-running harvest.
            exc_traceback = traceback.format_exc()
            print(exc_traceback)
            print("\n\n\n")
            file_name = f'{timestamp.strftime("%Y%m%dT%H%M%S")}.err'
            file_path = os.path.join(self.path, file_name)
            with open(file_path, "a") as f:
                f.write(
                    f"Dictionary: {record.marc_dict}\n\n"
                    f"{exc_traceback}\n\n\n\n")
def fetch_oai_recs_day(self, date):
    """Yield the identifier and raw record of every work listed for one day.

    The request uses the same ISO date for both ``from`` and ``until``.

    :param date: object providing ``isoformat()`` (e.g. ``datetime.date``).
    :return: generator of ``(identifier, raw_record)`` pairs, both encoded
        as UTF-8 bytes.
    """
    client = Sickle(self.endpoint_url)
    day = date.isoformat()
    # 'from' is a reserved word in Python, hence the dict expansion (the
    # approach recommended by the sickle documentation).
    query = {
        'metadataPrefix': self.metadata_prefix,
        'from': day,
        'until': day,
    }
    try:
        records = client.ListRecords(**query)
    except oaiexceptions.NoRecordsMatch:
        records = []
        _LOGGER.info('OAI request produced no records.')

    for item in records:
        key = item.header.identifier.encode('utf-8')
        yield key, item.raw.encode('utf-8')
def getData(request):
    """
    POST http://localhost/oai_pmh/api/getdata/
    POST data query='{"url":"value"}'

    Checks that the registry behind ``url`` answers an OAI ``Identify``
    request, then fetches ``url`` itself and returns the response text.

    Raises ``OAIAPISerializeLabelledException`` when the request data does
    not validate and ``OAIAPIException`` when the url has no query string
    or the fetch does not answer with HTTP 200.  A ``requests.HTTPError``
    is turned into a labelled response carrying the upstream status code.
    """
    try:
        serializer = IdentifySerializer(data=request.DATA)
        if not serializer.is_valid():
            raise OAIAPISerializeLabelledException(
                errors=serializer.errors,
                status=status.HTTP_400_BAD_REQUEST)

        # Read the url from the data that was just validated, as the other
        # views of this API do.
        url = request.DATA['url']
        if '?' not in url:
            raise OAIAPIException(
                message='An error occurred, url malformed.',
                status=status.HTTP_400_BAD_REQUEST)

        registryURl = url.split('?')[0]
        # Check if the OAI Registry is available
        sickle = Sickle(registryURl)
        sickle.Identify()

        http_response = requests.get(url)
        if http_response.status_code != status.HTTP_200_OK:
            raise OAIAPIException(message='An error occurred.',
                                  status=http_response.status_code)
        return Response(http_response.text, status=status.HTTP_200_OK)
    except requests.HTTPError as err:
        # ``as`` works on Python 2.6+ and 3; ``.message`` only exists on
        # Python 2 exceptions, so fall back to the string form.
        message = getattr(err, 'message', None) or str(err)
        content = APIMessage.getMessageLabelled(message)
        return Response(content, status=err.response.status_code)
def __init__(self, provider: OAIProvider, parser_name: str = None,
             unhandled_paths: set = None, validation: Callable = None,
             create_record: Callable = None, delete_record: Callable = None,
             update_record: Callable = None, pid_type: str = None,
             oai_identifiers: List[str] = None):
    """Set up harvesting state for ``provider``.

    Stores the handlers and options passed in, opens a ``Sickle`` client
    on ``provider.oai_endpoint``, loads the registry and builds the
    transformer from the provider's rules for ``parser_name``.

    :param provider: provider supplying the endpoint, parsers and rules.
    :param parser_name: key used to look up the parser and the rules.
    :param unhandled_paths: forwarded to ``OAITransformer``.
    :param validation: stored as ``validation_handler``.
    :param create_record: stored as ``create_record_handler``.
    :param delete_record: stored as ``delete_record_handler``.
    :param update_record: stored as ``update_record_handler``.
    :param pid_type: stored as ``pid_type``.
    :param oai_identifiers: stored as ``oai_identifiers``.
    """
    super().__init__(provider)

    # Plain state first.
    self.provider = provider
    self.pid_type = pid_type
    self.oai_identifiers = oai_identifiers
    self.oai_sync = None

    # Callbacks supplied by the caller.
    self.validation_handler = validation
    self.create_record_handler = create_record
    self.update_record_handler = update_record
    self.delete_record_handler = delete_record

    # Collaborators, created in the same order as before.
    self.sickle = Sickle(self.provider.oai_endpoint)
    registry.load()
    parsers = provider.get_parsers()
    rules = provider.get_rules(parser_name) or {}
    self.parsers = parsers
    self.rules = rules
    self.parser = parsers.get(parser_name) or {}
    self.transformer = OAITransformer(rules,
                                      unhandled_paths=unhandled_paths)
def listIdentifiers(request):
    """
    POST http://localhost/oai_pmh/api/listidentifiers
    POST data query='{"url":"value", "metadataprefix":"value"}' optional {"set":"value"}

    Runs an OAI ``ListIdentifiers`` request against ``url`` and returns the
    serialized list of headers.  Invalid request data or an
    ``OAIAPIException`` yields that exception's response; any other error
    yields a labelled HTTP 500 response.
    """
    try:
        serializer = RegistryURLSerializer(data=request.DATA)
        if not serializer.is_valid():
            raise OAIAPISerializeLabelledException(
                errors=serializer.errors,
                status=status.HTTP_400_BAD_REQUEST)

        url = request.DATA['url']
        metadataprefix = request.DATA['metadataprefix']
        setH = request.DATA.get('set', None)
        sickle = Sickle(url)
        rsp = sickle.ListIdentifiers(metadataPrefix=metadataprefix, set=setH)
        # Plain iteration replaces the manual .next()/StopIteration loop.
        rtn = [dict(header) for header in rsp]
        serializer = ListIdentifierSerializer(rtn)
        return Response(serializer.data, status=status.HTTP_200_OK)
    except OAIAPIException as e:
        return e.response()
    except Exception as e:
        # ``.message`` only exists on Python 2 exceptions; fall back to the
        # exception itself so the handler cannot fail on Python 3.
        content = APIMessage.getMessageLabelled(
            'An error occurred when attempting to identify resource: %s'
            % getattr(e, 'message', e))
        return Response(content, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
def getRecord(request):
    """
    POST http://localhost/oai_pmh/api/rest/getrecord
    POST data query='{"url":"value", "identifier":"value", "metadataprefix":"value"}'

    Fetches a single record with OAI ``GetRecord`` and returns it as a
    one-element serialized list.  ``metadata`` is ``None`` for a deleted
    record.
    """
    try:
        serializer = GetRecordSerializer(data=request.DATA)
        if not serializer.is_valid():
            raise OAIAPISerializeLabelledException(
                errors=serializer.errors,
                status=status.HTTP_400_BAD_REQUEST)

        url = request.DATA['url']
        identifier = request.DATA['identifier']
        metadataprefix = request.DATA['metadataprefix']

        client = Sickle(url)
        response = client.GetRecord(metadataPrefix=metadataprefix,
                                    identifier=identifier)
        record = Record(response.xml)
        header = record.header

        if record.deleted:
            metadata = None
        else:
            # The trailing '/' in the path is kept exactly as it was.
            node = record.xml.find(
                './/{http://www.openarchives.org/OAI/2.0/}metadata/')
            metadata = etree.tostring(node)

        payload = [{
            "identifier": header.identifier,
            "datestamp": header.datestamp,
            "deleted": record.deleted,
            "sets": header.setSpecs,
            "metadataPrefix": metadataprefix,
            "metadata": metadata,
            "raw": record.raw,
        }]
        return Response(RecordSerializer(payload).data,
                        status=status.HTTP_200_OK)
    except OAIAPIException as e:
        return e.response()
    except Exception as e:
        content = APIMessage.getMessageLabelled(
            'An error occurred when attempting to retrieve record. %s' % e)
        return Response(content, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
def __init__(self, config, query):
    """Open the OAI client and fetch the initial resumption token.

    :param config: mapping read as ``config['default']['oai_api_url']``.
    :param query: queried attribute (i.e. type, description, format,
        subject, etc.); stored as ``query_dict``.
    """
    self.sickle = Sickle(config['default']['oai_api_url'])
    self.resumption_token = self.get_token()
    self.query_dict = query
def list_sets_with_counts(repository_url, max_sets=500):
    """List the sets of a repository together with their identifiers.

    :param repository_url: OAI-PMH endpoint handed to ``Sickle``.
    :param max_sets: upper bound on the number of sets that are read.
    :return: list of dicts with the keys ``setSpec``, ``setName`` and
        ``set_identifiers`` (the result of ``list_identifiers`` for the
        set spec).

    Any error raised while reading the sets aborts with status 400.
    """
    from itertools import islice

    sickle = Sickle(repository_url)
    setlist = []
    listsets = sickle.ListSets()
    try:
        # islice caps the listing without a manual .next() loop.
        for s in islice(listsets, max_sets):
            setlist.append({
                'setSpec': s.setSpec,
                'setName': s.setName,
                'set_identifiers': list_identifiers(s.setSpec),
            })
    except Exception as e:
        abort(400, e)
    return setlist
def __init__(self, url_harvest, extra_data, **kwargs):
    """Create the OAI client and start the ``ListRecords`` request.

    Records are parsed with ``SickleDCRecord`` by default.  When
    ``extra_data`` is given it supplies the set spec (either as the value
    itself or as the ``set`` field of a query string), and the ``didl`` /
    ``marcxml`` prefixes switch to their dedicated record classes.
    """
    super(OAIFetcher, self).__init__(url_harvest, extra_data, **kwargs)
    # TODO: check extra_data?
    self.oai_client = Sickle(self.url)
    self._metadataPrefix = self.get_metadataPrefix(extra_data)

    mapping = self.oai_client.class_mapping
    verbs = ('ListRecords', 'GetRecord')
    # ensure not cached in module?
    for verb in verbs:
        mapping[verb] = SickleDCRecord

    if not extra_data:
        self.records = self.oai_client.ListRecords(
            metadataPrefix=self._metadataPrefix, ignore_deleted=True)
        return

    # extra data is set spec
    if 'set' in extra_data:
        self._set = parse_qs(extra_data)['set'][0]
    else:
        self._set = extra_data

    # A dedicated record class is used for didl and marcxml.
    prefix = self._metadataPrefix.lower()
    record_class = None
    if prefix == 'didl':
        record_class = SickleDIDLRecord
    elif prefix == 'marcxml':
        record_class = SickleMARCRecord
    if record_class is not None:
        for verb in verbs:
            mapping[verb] = record_class

    self.records = self.oai_client.ListRecords(
        metadataPrefix=self._metadataPrefix,
        set=self._set,
        ignore_deleted=True)
def main():
    """Harvest preprint records from a given date and save them as JSON.

    Command line: ``-f/--from_date`` (required) is sent as the OAI ``from``
    argument and is also part of the output file name.
    """
    arg_parser = argparse.ArgumentParser()
    arg_parser.add_argument('-f', '--from_date', required=True)
    params = arg_parser.parse_args()
    from_date = params.from_date

    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s] %(levelname)s %(message)s',
                        datefmt='%d/%b/%Y %H:%M:%S')

    # NOTE(review): verify=False is passed through unchanged -- confirm it
    # is intended for this endpoint.
    oai_client = Sickle(endpoint=OAI_PMH_PREPRINT_ENDPOINT,
                        max_retries=3,
                        verify=False)
    query = {'metadataPrefix': OAI_METADATA_PREFIX, 'from': from_date}
    records = oai_client.ListRecords(**query)
    logging.info(
        'Obtendo dados do OAI-PMH Preprints para date >= %s' % from_date)

    data = {}
    for record in records:
        data.update(parse(record))

    save(data, PREPRINT_DICTIONARY_PREFIX + from_date + '.json')
def get_bitstream_url(collection, record_in):
    """
    Harvests an href pointing to the bitstream urls for the record in
    repository.
    E.g., https://scholarspace.manoa.hawaii.edu/bitstream/10125/25006/1/editor.pdf

    Returns a dict with the keys ``bitstream`` and ``bitstream_txt``; a
    value stays ``None`` when it cannot be read from the record metadata
    (the failure is printed, not raised).
    """
    client = Sickle(collection.community.repository.base_url)
    client.class_mapping['GetRecord'] = LltRecordBitstream
    record = client.GetRecord(metadataPrefix='ore',
                              identifier=record_in.header.identifier)

    result = dict.fromkeys(('bitstream', 'bitstream_txt'))

    try:
        result['bitstream'] = record.metadata['bitstream']
    except Exception as err:
        print(err, 'Unable to construct bitstream url for',
              record_in.header.identifier)

    try:
        first_txt = record.metadata['bitstream_txt'][0]
        result['bitstream_txt'] = first_txt.replace('+', '%20')
    except Exception as err:
        print(err, 'Unable to construct bitstream_txt url for',
              record_in.header.identifier)

    return result
def __init__(self, source=None, setspec=None, **kwargs):
    """Init the loader for remote OAI-PMH access.

    :param source: OAI-PMH endpoint; defaults to the
        ``OPENAIRE_OAIPMH_ENDPOINT`` configuration value.
    :param setspec: set to harvest; defaults to the
        ``OPENAIRE_OAIPMH_DEFAULT_SET`` configuration value.
    """
    super(RemoteOAIRELoader, self).__init__(
        source or current_app.config['OPENAIRE_OAIPMH_ENDPOINT'],
        **kwargs)
    self.client = Sickle(self.source)
    # No trailing comma here: it used to turn ``setspec`` into a 1-tuple
    # instead of the set spec itself.
    self.setspec = (setspec or
                    current_app.config['OPENAIRE_OAIPMH_DEFAULT_SET'])
def test_override_encoding(self):
    """The ``encoding`` given to Sickle is set on the HTTP response."""
    fake_response = Mock(text='<xml/>')
    fake_get = Mock(return_value=fake_response)
    with patch('sickle.app.requests.get', fake_get):
        client = Sickle('url', encoding='encoding')
        client.ListSets()
        self.assertEqual(fake_response.encoding, 'encoding')
def descriptions(self):
    """Retrieve descriptions and dump them in the metadata cache file.

    Harvests every ``oai_dc`` record, keys its metadata by the second
    comma-separated part of the first ``source`` value, counts subject
    occurrences, creates the missing category pages and finally writes the
    collected metadata to ``self.__cache_meta__`` as JSON.
    """
    client = Sickle(self.__url_api__)
    records = list(client.ListRecords(metadataPrefix='oai_dc'))

    descr = OrderedDict()
    subjects = OrderedDict()
    cats = []
    for record in records:
        metadata = record.metadata
        item = metadata['source'][0].split(',')[1].strip()
        descr[item] = metadata
        self.__log__.info('%s', item)

        if 'subject' not in metadata:
            self.__log__.warning(' no subject for %s', item)
            continue

        for subject in metadata['subject']:
            if subject not in subjects:
                # First time this subject is seen: start its counter and
                # register the matching category.
                subjects[subject] = 1
                cats.append(self.subject_to_category(subject))
            else:
                subjects[subject] = subjects[subject] + 1

    self.__log__.info('Parsed %s items', len(records))
    self.__log__.info('Subjects: %s', json.dumps(subjects, indent=2))

    for cat in cats:
        self.__log__.info(' [[%s]]', cat)
        page = self.__site__.pages[cat]
        if not page.exists:
            page.save('[[{}]]'.format(self.__category__), 'Upload cat')

    self.__log__.info('Dumping metadata %s', self.__cache_meta__)
    with open(self.__cache_meta__, 'w') as cache_file:
        json.dump(descr, cache_file, indent=4, ensure_ascii=False)
def ssoarharvest(filename='support_data/data_harvest1.json'):
    """Harvest all ``oai_genios`` records from SSOAR into one JSON file.

    Each record's XML is converted with ``xmltodict`` and the conversions
    are written to ``filename`` as ``{"result": [...]}``.  The running
    count is printed every 10000 records.  Records that cannot be
    converted are left out and summarized on stdout.

    :param filename: path of the JSON file to write.
    """
    sickle = Sickle('https://www.ssoar.info/OAIHandler/request')
    records = sickle.ListRecords(metadataPrefix='oai_genios')

    converted = []
    failed_indexes = []
    for counter, record in enumerate(records, start=1):
        if counter % 10000 == 0:
            print(counter)
        try:
            # The parsed mapping is used as is.  It used to go through
            # eval(json.dumps(...)), which raised on JSON's null/true/false
            # and thereby silently dropped every record with an empty
            # element.
            converted.append(xmltodict.parse(etree.tostring(record.xml)))
        except Exception:
            failed_indexes.append(counter - 1)

    if failed_indexes:
        print('%d record(s) could not be converted: %s'
              % (len(failed_indexes), failed_indexes))

    with open(filename, 'w') as fp:
        json.dump({"result": converted}, fp, indent=4)
def fetch_iter(self):
    """Yield every record of the configured ``ListRecords`` request.

    The endpoint comes from ``self._oaiconfig['OAI']['url']`` and the
    request arguments from ``self.dic``.  Errors are logged, not raised;
    a success message is printed when the harvest runs to completion.
    """
    try:
        client = Sickle(self._oaiconfig['OAI']['url'])
        yield from client.ListRecords(**self.dic)
    except BadArgument as bad_argument:
        self._exception_logger.error(
            "bad argument exception {EXCEPTION}".format(
                EXCEPTION=str(bad_argument)))
    except OAIError as oai_error:
        self._exception_logger.error(
            "OAIError exception {EXCEPTION}".format(
                EXCEPTION=str(oai_error)))
    except NoRecordsMatch as no_match:
        self._summary_logger.error(
            "no records matched {EXCEPTION}".format(
                EXCEPTION=str(no_match)))
    except Exception as other_error:
        self._summary_logger.error(
            "base exception occured - not directly related to OAI {EXCEPTION}"
            .format(EXCEPTION=str(other_error)))
    else:
        print("oai fetching finished successfully")
def _get_database(self, number):
    """Download records into ``self.database``.

    Harvests the non-deleted ``oai_dc`` records of ``self.url``, converts
    each one with ``self._get_record_data`` and appends every non-empty
    result to ``self.database``.  Progress is printed every 100 stored
    records.

    :param number: stop once this many records have been stored; if the
        count never equals ``number`` the whole repository is read.
    """
    sickle = Sickle(self.url)
    records = sickle.ListRecords(metadataPrefix='oai_dc',
                                 ignore_deleted=True)
    no_of_records = 0
    # A for loop ends cleanly when the repository is exhausted; the former
    # ``records.next()`` loop let StopIteration escape to the caller.
    for record in records:
        data = self._get_record_data(record)
        if not data:
            continue
        self.database.append(data)
        no_of_records += 1
        if no_of_records % 100 == 0:
            print("Progress : {no_of_records} records Downloaded".format(
                no_of_records=no_of_records))
        if no_of_records == number:
            break
def get_records(identifiers, metadata_prefix=None, url=None, name=None,
                encoding=None):
    """Harvest specific records from an OAI repo via OAI-PMH identifiers.

    :param identifiers: list of unique identifiers for records to be
        harvested.
    :param metadata_prefix: The prefix for the metadata return
        (defaults to 'oai_dc').
    :param url: The url to be used to create the endpoint.
    :param name: The name of the OAIHarvestConfig to use instead of passing
        specific parameters.
    :param encoding: Override the encoding returned by the server.
        ISO-8859-1 if it is not provided by the server.
    :return: request object, list of harvested records
    """
    if name:
        url, configured_prefix, _, __ = get_info_by_oai_name(name)
        # An explicitly given prefix wins over the one stored under
        # ``name``.
        if metadata_prefix is None:
            metadata_prefix = configured_prefix
    elif not url:
        raise NameOrUrlMissing(
            "Retry using the parameters -n <name> or -u <url>."
        )

    request = Sickle(url, encoding=encoding)
    prefix = metadata_prefix or "oai_dc"
    records = [
        request.GetRecord(identifier=identifier, metadataPrefix=prefix)
        for identifier in identifiers
    ]
    return request, records
def test_list_oai_collections(self, community):
    """Construct the list of collections "owned" by the given community.

    Collections are a secondary grouping concept in OAI.  Utilizes the
    OAI-PMH verbs ListIdentifiers and ListSets.

    :param community: object providing ``repository.base_url`` and
        ``identifier``.
    :return: list of ``(setSpec, setName)`` tuples sorted by name, also
        stored in ``self.collections``.  ``setName`` is ``None`` for a
        collection that ListSets does not report.
    """
    sickle = Sickle(community.repository.base_url)

    # Retrieve the record headers associated with the community.
    record_headers = sickle.ListIdentifiers(metadataPrefix='oai_dc',
                                            set=community.identifier)

    # Register every collection id found in the headers' set specs.
    community_collections = {}
    for header in record_headers:
        for set_spec in header.setSpecs:
            if set_spec.startswith('col'):
                community_collections[set_spec] = None

    # Map names to ids {setSpec: setName}; the ListSets request returns
    # the 'setName' of each collection.
    for oai_set in sickle.ListSets():
        # Bug in oai? In set results a 'collection' has a prefix of 'com'!
        modstr = 'col' + oai_set.setSpec[3:]
        if modstr in community_collections:
            community_collections[modstr] = oai_set.setName

    # Sort by name.  Unnamed collections (None) are ordered as '' so the
    # comparison cannot raise TypeError on Python 3.
    self.collections = sorted(community_collections.items(),
                              key=lambda item: item[1] or '')
    return self.collections
def _oai2d_endpoint_identifiers(self):
    """Return a set of the Community OAI Set recids from OAI endpoint."""
    fake_get = sickle_requests_get_mock()
    with patch('sickle.app.requests.get', new=fake_get):
        client = Sickle('http://auditor/oai2d')
        headers = client.ListIdentifiers(
            set=self.community.oaiset_spec, metadataPrefix='oai_dc')
        recids = set()
        for header in headers:
            # The recid is whatever follows the last ':' of the identifier.
            recids.add(int(header.identifier.rsplit(':', 1)[-1]))
        return recids
def __init__(self, name, provider_code, oai_endpoint, metadata_prefix, set_,
             constant_fields: dict = None, parser: Callable = None,
             transformer=None, endpoints=None,
             default_endpoint: str = "recid", endpoint_mapping=None,
             pid_field=None, from_: str = None,
             endpoint_handler: dict = None, bulk: bool = True,
             pre_processors: dict = None, post_processors: dict = None,
             index: str = None):
    """Store the synchronizer settings and open the OAI client.

    Arguments are stored under attributes of the same name, with these
    exceptions: ``pid_field`` falls back to the ``PIDSTORE_RECID_FIELD``
    configuration value (default ``"recid"``), ``endpoint_mapping`` and
    ``constant_fields`` fall back to empty dicts, ``index`` is stored as
    ``_index`` and ``from_`` is only assigned when it is truthy.
    """
    # Counters
    self.only_fetch = False
    self.deleted = self.created = self.modified = 0

    if endpoint_mapping is None:
        endpoint_mapping = {}
    self.pid_field = (
        current_app.config.get('PIDSTORE_RECID_FIELD', "recid")
        if pid_field is None else pid_field)

    self.name = name
    self.provider_code = provider_code
    self.metadata_prefix = metadata_prefix
    self.oai_endpoint = oai_endpoint
    self.oai_sync = None
    self.sickle = Sickle(self.oai_endpoint)

    self.parser = parser
    self.transformer = transformer
    self.endpoints = endpoints
    self.default_endpoint = default_endpoint
    self.endpoint_mapping = endpoint_mapping
    self.set_ = set_
    self.constant_fields = constant_fields if constant_fields else {}

    self._from = None
    if from_:
        self.from_ = from_

    self.endpoint_handler = endpoint_handler
    self.bulk = bulk
    self.pre_processors = pre_processors
    self.post_processors = post_processors
    self.overwrite = False
    self.es_client = current_search_client
    self._index = index
def test_override_encoding(self):
    """ListSets issues exactly one GET for the ``ListSets`` verb."""
    fake_response = Mock(text='<xml/>', content='<xml/>', status_code=200)
    fake_get = Mock(return_value=fake_response)
    with patch.object(Session, 'get', fake_get):
        client = Sickle('url', encoding='encoding')
        client.ListSets()
        fake_get.assert_called_once_with('url',
                                         params={'verb': 'ListSets'})
def harvest_oai_collection_records_sickle(self, collection):
    """Return the ``dim`` records of a collection, skipping deleted ones.

    Records are parsed with ``LltRecord``.

    :param collection: object providing ``community.repository.base_url``
        and ``identifier`` (used as the OAI set).
    :return: the record iterator created by ``ListRecords``.
    """
    client = Sickle(collection.community.repository.base_url)
    for verb in ('ListRecords', 'GetRecord'):
        client.class_mapping[verb] = LltRecord
    return client.ListRecords(metadataPrefix='dim',
                              ignore_deleted=True,
                              set=collection.identifier)
def _fetch_records(endpoint, count):
    """Return the metadata of the first ``count`` non-deleted records.

    :param endpoint: OAI-PMH endpoint handed to ``Sickle``.
    :param count: index at which reading stops.
    :return: list with the ``metadata`` of each record read.
    """
    collected = []
    stream = Sickle(endpoint).ListRecords(metadataPrefix='oai_dc',
                                          ignore_deleted=True)
    for position, record in enumerate(stream):
        if position != count:
            collected.append(record.metadata)
            continue
        break
    return collected
def harvest_oai(**kwargs):
    """Create OAI ListRecords Iterator for Harvesting Data.

    Reads ``oai_endpoint`` and ``harvest_params`` from the keyword
    arguments; the client retries on HTTP 500 and 503.
    """
    endpoint = kwargs.get("oai_endpoint")
    params = kwargs.get("harvest_params")
    logging.info("Harvesting from %s", endpoint)
    logging.info("Harvesting %s", params)
    client = Sickle(endpoint, retry_status_codes=[500, 503])
    return client.ListRecords(**params)
def parse_single(self, response):
    """Fetch the record named in ``response.meta`` and parse it.

    The record is requested with ``GetRecord`` in ``self.format``, stored
    in ``self._crawled_records`` under its identifier, and its raw XML is
    handed to ``self.parse_record`` through an XML selector.
    """
    client = Sickle(self.url)
    identifier = response.meta['identifier']
    record = client.GetRecord(metadataPrefix=self.format,
                              identifier=identifier)
    self._crawled_records[identifier] = record

    xml_response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
    return self.parse_record(Selector(xml_response, type='xml'))
class TestCase(unittest.TestCase):
    """Exercise the Sickle verbs against canned responses.

    ``Sickle.harvest`` is replaced by ``fake_harvest`` for every test, so
    no HTTP request is made.
    """

    def setUp(self):
        patcher = mock.patch('sickle.app.Sickle.harvest', fake_harvest)
        patcher.start()
        # Undo the patch after each test so it cannot leak elsewhere.
        self.addCleanup(patcher.stop)
        self.sickle = Sickle('fake_url')

    def test_OAIResponse(self):
        response = self.sickle.harvest(verb='ListRecords',
                                       metadataPrefix='oai_dc')
        response.xml
        response.raw

    def test_broken_XML(self):
        response = self.sickle.harvest(
            verb='ListRecords', resumptionToken='ListRecordsBroken.xml')
        response.xml
        response.raw

    def test_ListRecords(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc')
        assert len([r for r in records]) == 8

    def test_ListRecords_ignore_deleted(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc',
                                          ignore_deleted=True)
        # Deleted records in the test data are skipped.
        num_records = len([r for r in records])
        assert num_records == 4

    def test_ListSets(self):
        sets = [s for s in self.sickle.ListSets()]
        assert len(sets) == 131
        # Index the list: a comprehension variable is not visible outside
        # the comprehension on Python 3.
        dict(sets[-1])

    def test_ListMetadataFormats(self):
        mdfs = [mdf for mdf in self.sickle.ListMetadataFormats()]
        assert len(mdfs) == 5
        dict(mdfs[-1])

    def test_ListIdentifiers(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc')
        assert len([r for r in records]) == 4

    def test_ListIdentifiers_ignore_deleted(self):
        records = self.sickle.ListIdentifiers(
            metadataPrefix='oai_dc', ignore_deleted=True)
        # There are 2 deleted headers in the test data
        num_records = len([r for r in records])
        assert num_records == 2

    def test_Identify(self):
        identify = self.sickle.Identify()
        assert hasattr(identify, 'repositoryName')
        assert hasattr(identify, 'baseURL')
        assert hasattr(identify, 'adminEmail')
        assert hasattr(identify, 'earliestDatestamp')
        assert hasattr(identify, 'deletedRecord')
        assert hasattr(identify, 'granularity')
        assert hasattr(identify, 'description')
        assert hasattr(identify, 'oai_identifier')
        assert hasattr(identify, 'sampleIdentifier')
        dict(identify)

    def test_GetRecord(self):
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        assert record.header.identifier == oai_id
        assert oai_id in record.raw
        record.xml
        str(record)
        unicode(record)
        dict(record.header)
        dict(record.origin)
        assert dict(record) == record.metadata

    # Test OAI-specific exceptions

    @raises(BadArgument)
    def test_badArgument(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badArgument')

    @raises(CannotDisseminateFormat)
    def test_cannotDisseminateFormat(self):
        self.sickle.ListRecords(
            metadataPrefix='oai_dc', error='cannotDisseminateFormat')

    @raises(IdDoesNotExist)
    def test_idDoesNotExist(self):
        self.sickle.GetRecord(
            metadataPrefix='oai_dc', error='idDoesNotExist')

    # Previously also named test_idDoesNotExist, which replaced the test
    # above so that IdDoesNotExist was never exercised.
    @raises(NoSetHierarchy)
    def test_noSetHierarchy(self):
        self.sickle.ListSets(
            metadataPrefix='oai_dc', error='noSetHierarchy')

    @raises(BadResumptionToken)
    def test_badResumptionToken(self):
        self.sickle.ListRecords(
            metadataPrefix='oai_dc', error='badResumptionToken')

    @raises(NoRecordsMatch)
    def test_noRecordsMatch(self):
        self.sickle.ListRecords(
            metadataPrefix='oai_dc', error='noRecordsMatch')

    @raises(OAIError)
    def test_undefined_OAI_error_XML(self):
        self.sickle.ListRecords(
            metadataPrefix='oai_dc', error='undefinedError')

    @mock.patch('sickle.app.Sickle.harvest', fake_harvest)
    def test_OAIResponseIterator(self):
        sickle = Sickle('fake_url', rtype='response')
        records = [r for r in sickle.ListRecords(metadataPrefix='oai_dc')]
        assert len(records) == 4
def setUp(self):
    """Replace ``Sickle.harvest`` with ``fake_harvest`` and build a client."""
    harvest_patcher = mock.patch('sickle.app.Sickle.harvest', fake_harvest)
    harvest_patcher.start()
    self.sickle = Sickle('http://localhost')
def setUp(self):
    """Start the patch held in ``self.patch`` and build a client."""
    harvest_patch = self.patch
    harvest_patch.start()
    self.sickle = Sickle('http://localhost')
class TestCase(unittest.TestCase):
    """Exercise the Sickle verbs against canned responses.

    ``Sickle.harvest`` is patched with ``mock_harvest`` for the duration of
    each test (started in ``setUp``, stopped in ``tearDown``).
    """

    def __init__(self, methodName='runTest'):
        super(TestCase, self).__init__(methodName)
        self.patch = mock.patch('sickle.app.Sickle.harvest', mock_harvest)

    def setUp(self):
        self.patch.start()
        self.sickle = Sickle('http://localhost')

    def tearDown(self):
        self.patch.stop()

    def test_OAIResponse(self):
        response = self.sickle.harvest(verb='ListRecords',
                                       metadataPrefix='oai_dc')
        self.assertIsInstance(response.xml, etree._Element)
        self.assertIsInstance(response.raw, string_types)

    def test_broken_XML(self):
        response = self.sickle.harvest(
            verb='ListRecords', resumptionToken='ListRecordsBroken.xml')
        self.assertEqual(response.xml, None)
        self.assertIsInstance(response.raw, string_types)

    def test_ListRecords(self):
        harvested = list(self.sickle.ListRecords(metadataPrefix='oai_dc'))
        self.assertEqual(len(harvested), 8)

    def test_ListRecords_ignore_deleted(self):
        harvested = list(self.sickle.ListRecords(metadataPrefix='oai_dc',
                                                 ignore_deleted=True))
        self.assertEqual(len(harvested), 4)

    def test_ListSets(self):
        sets = list(self.sickle.ListSets())
        self.assertEqual(131, len(sets))
        dict(sets[0])

    def test_ListMetadataFormats(self):
        mdfs = list(self.sickle.ListMetadataFormats())
        self.assertEqual(5, len(mdfs))
        dict(mdfs[0])

    def test_ListIdentifiers(self):
        headers = list(self.sickle.ListIdentifiers(metadataPrefix='oai_dc'))
        self.assertEqual(len(headers), 4)

    def test_ListIdentifiers_ignore_deleted(self):
        # There are 2 deleted headers in the test data
        headers = list(self.sickle.ListIdentifiers(
            metadataPrefix='oai_dc', ignore_deleted=True))
        self.assertEqual(len(headers), 2)

    def test_Identify(self):
        identify = self.sickle.Identify()
        expected_attributes = (
            'repositoryName',
            'baseURL',
            'adminEmail',
            'earliestDatestamp',
            'deletedRecord',
            'granularity',
            'description',
            'oai_identifier',
            'sampleIdentifier',
        )
        for attribute in expected_attributes:
            self.assertTrue(hasattr(identify, attribute))
        dict(identify)

    def test_GetRecord(self):
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        header = record.header
        self.assertEqual(header.identifier, oai_id)
        self.assertIn(oai_id, record.raw)
        self.assertEqual(header.datestamp, '2011-09-05T12:51:52Z')
        self.assertIsInstance(record.xml, etree._Element)
        binary_type(record)
        text_type(record)
        dict(record.header)
        self.assertEqual(dict(record), record.metadata)

    # Test OAI-specific exceptions

    @raises(BadArgument)
    def test_badArgument(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badArgument')

    @raises(CannotDisseminateFormat)
    def test_cannotDisseminateFormat(self):
        self.sickle.ListRecords(
            metadataPrefix='oai_dc', error='cannotDisseminateFormat')

    @raises(IdDoesNotExist)
    def test_idDoesNotExist(self):
        self.sickle.GetRecord(
            metadataPrefix='oai_dc', error='idDoesNotExist')

    @raises(NoSetHierarchy)
    def test_noSetHierarchy(self):
        self.sickle.ListSets(
            metadataPrefix='oai_dc', error='noSetHierarchy')

    @raises(BadResumptionToken)
    def test_badResumptionToken(self):
        self.sickle.ListRecords(
            metadataPrefix='oai_dc', error='badResumptionToken')

    @raises(NoRecordsMatch)
    def test_noRecordsMatch(self):
        self.sickle.ListRecords(
            metadataPrefix='oai_dc', error='noRecordsMatch')

    @raises(OAIError)
    def test_undefined_OAI_error_XML(self):
        self.sickle.ListRecords(
            metadataPrefix='oai_dc', error='undefinedError')

    def test_OAIResponseIterator(self):
        client = Sickle('fake_url', iterator=OAIResponseIterator)
        responses = list(client.ListRecords(metadataPrefix='oai_dc'))
        self.assertEqual(len(responses), 4)
def setUp(self):
    """Replace ``Sickle.harvest`` with ``fake_harvest`` and build a client."""
    harvest_patcher = mock.patch('sickle.app.Sickle.harvest', fake_harvest)
    harvest_patcher.start()
    self.sickle = Sickle('fake_url')