def getRecord(request):
    """
    POST http://localhost/oai_pmh/api/rest/getrecord
    POST data query='{"url":"value", "identifier":"value", "metadataprefix":"value"}'
    """
    try:
        serializer = GetRecordSerializer(data=request.DATA)
        if serializer.is_valid():
            url = request.DATA['url']
            identifier = request.DATA['identifier']
            metadataprefix = request.DATA['metadataprefix']
            sickle = Sickle(url)
            grResponse = sickle.GetRecord(metadataPrefix=metadataprefix,
                                          identifier=identifier)
            record = Record(grResponse.xml)
            rtn = []
            rtn.append({
                "identifier": record.header.identifier,
                "datestamp": record.header.datestamp,
                "deleted": record.deleted,
                "sets": record.header.setSpecs,
                "metadataPrefix": metadataprefix,
                "metadata": etree.tostring(
                    record.xml.find('.//' + '{http://www.openarchives.org/OAI/2.0/}' + 'metadata/')
                ) if not record.deleted else None,
                "raw": record.raw,
            })
            serializer = RecordSerializer(rtn)
            return Response(serializer.data, status=status.HTTP_200_OK)
        else:
            raise OAIAPISerializeLabelledException(errors=serializer.errors,
                                                   status=status.HTTP_400_BAD_REQUEST)
    except OAIAPIException as e:
        return e.response()
    except Exception as e:
        content = APIMessage.getMessageLabelled(
            'An error occurred when attempting to retrieve record. %s' % e)
        return Response(content, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
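# A minimal client-side sketch of calling the view above with the requests
# library. The host, path, and field values are assumptions taken from the
# docstring, not a confirmed deployment.
import requests

payload = {
    "url": "https://example.org/oai/request",   # assumed OAI base URL
    "identifier": "oai:example.org:1234",       # assumed record identifier
    "metadataprefix": "oai_dc",
}
resp = requests.post("http://localhost/oai_pmh/api/rest/getrecord", data=payload)
print(resp.status_code, resp.json())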
def get_bitstream_url(collection, record_in):
    """
    Harvests an href pointing to the bitstream urls for the record in repository.
    E.g., https://scholarspace.manoa.hawaii.edu/bitstream/10125/25006/1/editor.pdf
    """
    sickle = Sickle(collection.community.repository.base_url)
    sickle.class_mapping['GetRecord'] = LltRecordBitstream
    record = sickle.GetRecord(metadataPrefix='ore',
                              identifier=record_in.header.identifier)
    bitstreams = {'bitstream': None, 'bitstream_txt': None}
    try:
        bitstreams['bitstream'] = record.metadata['bitstream']
    except Exception as e:
        print(e, 'Unable to construct bitstream url for',
              record_in.header.identifier)
    try:
        bitstreams['bitstream_txt'] = record.metadata['bitstream_txt'][0].replace('+', '%20')
    except Exception as e:
        print(e, 'Unable to construct bitstream_txt url for',
              record_in.header.identifier)
    return bitstreams
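# A brief usage sketch, assuming `collection` and `record_in` are the harvested
# model objects this helper expects; the attribute access below mirrors the
# function body and is not verified against the actual models.
bitstreams = get_bitstream_url(collection, record_in)
if bitstreams['bitstream']:
    print('bitstream url(s):', bitstreams['bitstream'])
if bitstreams['bitstream_txt']:
    print('text bitstream url:', bitstreams['bitstream_txt'])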
def get_records(identifiers, metadata_prefix=None, url=None, name=None,
                encoding=None):
    """Harvest specific records from an OAI repo via OAI-PMH identifiers.

    :param identifiers: list of unique identifiers for records to be harvested.
    :param metadata_prefix: The prefix for the metadata return (defaults to 'oai_dc').
    :param url: The url to be used to create the endpoint.
    :param name: The name of the OAIHarvestConfig to use instead of passing
                 specific parameters.
    :param encoding: Override the encoding returned by the server. ISO-8859-1
                     if it is not provided by the server.
    :return: request object, list of harvested records
    """
    if name:
        url, _metadata_prefix, _, __ = get_info_by_oai_name(name)

        # In case we provide a prefix, we don't want it to be
        # overwritten by the one we get from the name variable.
        if metadata_prefix is None:
            metadata_prefix = _metadata_prefix
    elif not url:
        raise NameOrUrlMissing(
            "Retry using the parameters -n <name> or -u <url>."
        )

    request = Sickle(url, encoding=encoding)

    records = []
    for identifier in identifiers:
        arguments = {
            'identifier': identifier,
            'metadataPrefix': metadata_prefix or "oai_dc"
        }
        records.append(request.GetRecord(**arguments))

    return request, records
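# A minimal usage sketch under assumed values; the endpoint URL and the
# identifiers below are placeholders, not taken from a real configuration.
request, records = get_records(
    identifiers=['oai:example.org:1', 'oai:example.org:2'],
    metadata_prefix='oai_dc',
    url='https://example.org/oai2d',
)
for rec in records:
    print(rec.header.identifier, rec.header.datestamp)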
def get_record_metadata(repository_url, identifier):
    sickle = Sickle(repository_url)
    rec = sickle.GetRecord(
        identifier=identifier,
        metadataPrefix='oai_dc'
    )
    return rec.metadata
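# A hedged usage sketch; the repository URL and identifier are illustrative
# placeholders. The return value is the Dublin Core metadata dict of lists.
metadata = get_record_metadata(
    'https://example.org/oai2d',
    'oai:example.org:1234567',
)
print(metadata.get('title'))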
def parse_single(self, response):
    sickle = Sickle(self.url)
    params = {
        'metadataPrefix': self.format,
        'identifier': response.meta['identifier'],
    }
    record = sickle.GetRecord(**params)
    self._crawled_records[params['identifier']] = record

    response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
    selector = Selector(response, type='xml')
    return self.parse_record(selector)
def get_xml_1(oai_identifier):
    sickle = Sickle("https://dspace.cuni.cz/oai/nusl")
    record = sickle.GetRecord(metadataPrefix="xoai", identifier=oai_identifier)
    file_directory = Path(__file__).parent
    target_directory = file_directory / ".." / "tests" / "data"
    oai_identifier_array = oai_identifier.split(":")
    oai_identifier_fixed = oai_identifier_array[-1]
    oai_identifier_fixed = oai_identifier_fixed.replace(".", "_")
    oai_identifier_fixed = oai_identifier_fixed.replace("/", "-")
    filename = str(target_directory / f"{oai_identifier_fixed}.xml")
    with open(filename, "w+") as f:
        f.write(record.raw)
    print(filename, "created")
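# A short usage sketch; the identifier is a hypothetical example of the
# "oai:...:<id>" form the function expects. The part after the last ':' becomes
# the file name written into ../tests/data/.
get_xml_1("oai:dspace.cuni.cz:123456")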
def harvest(host, from_date, until, format, out, set, verbose):
    counter = 0
    if verbose:
        logging.basicConfig(level=logging.DEBUG)
    else:
        logging.basicConfig(level=logging.INFO)
    logging.info("OAI-PMH harvesting from %s", host)
    logging.info("From date = %s", from_date)
    logging.info("Until date = %s", until)
    logging.info("Metadata format = %s", format)
    logging.info("Outfile = %s", out)

    mysickle = Sickle(host, iterator=OAIItemIterator)
    params = {'metadataPrefix': format, 'from': from_date, 'until': until}
    if set is not None:
        params['set'] = set
    try:
        responses = mysickle.ListIdentifiers(**params)
    except NoRecordsMatch:
        logging.info("No records harvested: the combination of the values of "
                     "the arguments results in an empty list.")
        sys.exit()

    identifier_list = []
    for records in responses:
        identifier_list.append(records.identifier)
    logging.info(f"Identifier count to harvest: {len(identifier_list)}")

    with open(out, 'wb') as f:
        f.write('<records>'.encode())
        for identifier in identifier_list:
            r = mysickle.GetRecord(identifier=identifier, metadataPrefix=format)
            f.write(r.raw.encode('utf8'))
            logging.debug(counter)
            logging.debug(r.raw)
            counter += 1
        f.write('</records>'.encode())
    logging.info("Total records harvested: %i", counter)
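# An illustrative direct invocation of the harvest function above; the host,
# date range, and output file are placeholders, not a known deployment.
harvest(
    host="https://example.org/oai2d",
    from_date="2024-01-01",
    until="2024-01-02",
    format="oai_dc",
    out="harvested.xml",
    set=None,
    verbose=False,
)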
def oai_get_record(id, name, transformation, record_cls, access_token=None,
                   identifier=None, dbcommit=False, reindex=False,
                   test_md5=False, verbose=False, debug=False, **kwargs):
    """Get record from an OAI repo.

    :param identifier: identifier of record.
    """
    url, metadata_prefix, lastrun, setspecs = get_info_by_oai_name(name)

    request = Sickle(url)

    params = {}
    if access_token:
        params['accessToken'] = access_token
    params['metadataPrefix'] = metadata_prefix
    params['identifier'] = f'{identifier}{id}'
    try:
        record = request.GetRecord(**params)
    except Exception as err:
        if debug:
            raise Exception(err)
        return None
    records = parse_xml_to_array(StringIO(record.raw))
    trans_record = transformation(records[0]).json
    if verbose:
        click.echo(f'OAI-{name} get: {id}')
    return trans_record
class TestCaseWrongEncoding(unittest.TestCase):

    def __init__(self, methodName='runTest'):
        super(TestCaseWrongEncoding, self).__init__(methodName)
        self.patch = mock.patch('sickle.app.requests.get', mock_get)

    def setUp(self):
        self.patch.start()
        self.sickle = Sickle('http://localhost')

    def tearDown(self):
        self.patch.stop()

    def test_GetRecord(self):
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        self.assertEqual(record.header.identifier, oai_id)
        self.assertIn(oai_id, record.raw)
        self.assertEqual(record.header.datestamp, '2011-09-05T12:51:52Z')
        self.assertIsInstance(record.xml, etree._Element)
        binary_type(record)
        text_type(record)
        dict(record.header)
        self.assertEqual(dict(record), record.metadata)
        self.assertIn(u'某人', record.metadata['creator'])
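# Related to what this test exercises: Sickle accepts an `encoding` argument
# that overrides the encoding reported by the server (the same parameter used
# by get_records above). A minimal sketch with a placeholder endpoint:
from sickle import Sickle

sickle = Sickle('https://example.org/oai2d', encoding='utf-8')
record = sickle.GetRecord(identifier='oai:example.org:1',
                          metadataPrefix='oai_dc')
print(record.metadata.get('creator'))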
class OAISynchronizer: """ """ def __init__(self, name, provider_code, oai_endpoint, metadata_prefix, set_, constant_fields: dict = None, parser: Callable = None, transformer=None, endpoints=None, default_endpoint: str = "recid", endpoint_mapping=None, pid_field=None, from_: str = None, endpoint_handler: dict = None, bulk: bool = True, pre_processors: dict = None, post_processors: dict = None, index: str = None): # Counters self.only_fetch = False self.deleted = 0 self.created = 0 self.modified = 0 if endpoint_mapping is None: # pragma: no cover endpoint_mapping = {} if pid_field is None: self.pid_field = current_app.config.get('PIDSTORE_RECID_FIELD', "recid") else: # pragma: no cover self.pid_field = pid_field self.name = name self.provider_code = provider_code self.metadata_prefix = metadata_prefix self.oai_endpoint = oai_endpoint self.oai_sync = None self.sickle = Sickle(self.oai_endpoint) self.parser = parser self.transformer = transformer self.endpoints = endpoints self.default_endpoint = default_endpoint self.endpoint_mapping = endpoint_mapping self.set_ = set_ if constant_fields: self.constant_fields = constant_fields else: self.constant_fields = {} self._from = None if from_: self.from_ = from_ self.endpoint_handler = endpoint_handler self.bulk = bulk self.pre_processors = pre_processors self.post_processors = post_processors self.overwrite = False self.es_client = current_search_client self._index = index @property def index(self): if self._index: _index = self._index else: _index = f"{self.provider_code}_{self.metadata_prefix}" if not self.es_client.indices.exists(_index): # pragma: no cover current_search_client.indices.create(index=_index, ignore=400, body={}) return _index @property def from_(self): return self._from @from_.setter def from_(self, value): if value == "latest": last_sync = OAISync.query.order_by(OAISync.id.desc()).first() if last_sync: self._from = arrow.get(last_sync) elif value is not None: if isinstance(value, arrow.Arrow): self._from = value else: self._from = arrow.get(value) else: self._from = None def run(self, start_oai: str = None, start_id: int = 0, break_on_error: bool = True, oai_id: Union[str, List[str]] = None, overwrite: bool = False, only_fetch: bool = False, index: str = None): """ :return: :rtype: """ if index: self._index = index self.only_fetch = only_fetch self.overwrite = overwrite self.restart_counters() with db.session.begin_nested(): self.oai_sync = OAISync( provider_code=self.provider_code, synchronizer_code=self.name, sync_start=arrow.utcnow(). 
datetime, # datetime.datetime.utcnow(), status="active", purpose="fetch" if only_fetch else "sync") db.session.add(self.oai_sync) db.session.commit() try: if oai_id: if isinstance(oai_id, str): oai_ids = [oai_id] elif isinstance(oai_id, list): oai_ids = oai_id else: # pragma: no cover raise Exception( "OAI identifier must be string or list of strings") self.synchronize(identifiers=oai_ids, break_on_error=break_on_error) self.update_oai_sync("ok") else: self.synchronize(start_oai=start_oai, start_id=start_id, break_on_error=break_on_error) self.update_oai_sync("ok") except: self.update_oai_sync("failed") raise finally: db.session.commit() def update_oai_sync(self, status): with db.session.begin_nested(): # self.oai_sync = db.session.merge(self.oai_sync) self.oai_sync.status = status self.oai_sync.sync_end = arrow.utcnow( ).datetime # datetime.datetime.utcnow() self.oai_sync.records_modified = self.modified self.oai_sync.records_created = self.created self.oai_sync.records_deleted = self.deleted if status == "failed": self.oai_sync.logs = traceback.format_exc() db.session.add(self.oai_sync) db.session.commit() def synchronize(self, identifiers=None, start_oai: str = None, start_id: int = 0, break_on_error: bool = True): # pragma: no cover """ :return: :rtype: """ logger.info( f"OAI harvester on endpoint: {self.oai_endpoint} has started!") if not self.bulk: identifiers = self._get_identifiers(identifiers, start_id) for idx, identifier in enumerate(identifiers, start=start_id): self.record_handling(idx, start_oai, break_on_error, identifier) else: records = self._get_records_iterator(start_id, list_identifiers=identifiers) print("Waiting for server...") t0 = datetime.now() for idx, record in enumerate(records, start=start_id): logger.debug(f"Time for record: {datetime.now()-t0}") t0 = datetime.now() self.record_handling(idx, start_oai, break_on_error, xml=record.xml) dt = datetime.now() - t0 logger.debug(f"Time for record_handling: {dt}") def _get_records_iterator(self, start_id: int = 0, list_identifiers: List[str] = None): if self.from_: records = self.sickle.ListRecords( **{ "metadataPrefix": self.metadata_prefix, "set": self.set_, "from": self.from_.format("YYYY-MM-DD") }) else: records = self.sickle.ListRecords( metadataPrefix=self.metadata_prefix, set=self.set_) if list_identifiers: return self.record_filter_generator(records, list_identifiers) else: return islice(records, start_id, None) def record_filter_generator(self, iterator, identifiers_list): for record in iterator: if record.header.identifier in identifiers_list: yield record def record_handling(self, idx, start_oai: str = None, break_on_error: bool = True, identifier: Header = None, xml: _Element = None, only_fetch: bool = None): if not only_fetch: only_fetch = self.only_fetch if not (identifier or xml): # pragma: no cover raise Exception("Must provide header or xml") if identifier and xml: # pragma: no cover raise Exception("You must provide only header or xml") if identifier: datestamp, deleted, oai_identifier = get_oai_header_data( identifier) else: datestamp, deleted, oai_identifier = get_oai_header_data(xml=xml) logger.info(f"{idx}. 
Record, OAI ID: '{oai_identifier}'") oai_rec = OAIRecord.get_record(oai_identifier) if not start_oai or oai_identifier == start_oai: # pragma: no cover TODO: vyřešit # start_oai/není implemntováno collect = True else: collect = False if not collect: # pragma: no cover return try: self.record_crud(oai_rec, timestamp=datestamp, deleted=deleted, idx=idx, oai_identifier=oai_identifier, xml=xml, only_fetch=only_fetch) except Exception: # pragma: no cover self.exception_handler(oai_identifier) if break_on_error: raise return def exception_handler(self, oai_identifier): exc = traceback.format_exc() print(exc, "\n\n\n") oai_exc = OAIRecordExc.query.filter_by( oai_identifier=oai_identifier, oai_sync_id=self.oai_sync.id).one_or_none() if not oai_exc: oai_exc = OAIRecordExc(oai_identifier=oai_identifier, traceback=exc, oai_sync_id=self.oai_sync.id) db.session.add(oai_exc) else: oai_exc.traceback = exc db.session.commit() def record_crud(self, oai_rec: OAIRecord = None, oai_identifier: str = None, timestamp: str = arrow.utcnow().isoformat(), deleted: bool = False, xml: _Element = None, idx: int = 0, only_fetch: bool = False): if not (oai_rec or oai_identifier): raise Exception("You have to provide oai_rec or oai_identifier") if not oai_identifier: oai_identifier = oai_rec.oai_identifier if only_fetch: if deleted: self.delete_es(oai_identifier) else: self.create_or_update_es(oai_identifier, xml=xml) else: if deleted: self._delete(oai_rec) else: try: self.create_or_update(oai_identifier, timestamp, oai_rec=oai_rec, xml=xml) except IdDoesNotExist: # pragma: no cover self._delete(oai_rec) if idx % 100: db.session.commit() def _get_identifiers(self, identifiers=None, start_id: int = 0): if identifiers is None: identifiers = self._get_oai_identifiers() else: identifiers = self._get_oai_identifiers( identifiers_list=identifiers) identifiers = islice(identifiers, start_id, None) return identifiers def _delete(self, oai_rec): if not oai_rec: return self.delete_record(oai_rec) self.deleted += 1 logger.info( f"Identifier '{oai_rec.oai_identifier}' has been marked as deleted" ) def _get_oai_identifiers(self, sickle=None, metadata_prefix=None, set_=None, identifiers_list: List[str] = None, from_: arrow.Arrow = None): if identifiers_list: return [ self.sickle.GetRecord( identifier=identifier, metadataPrefix=self.metadata_prefix).header for identifier in identifiers_list ] if not sickle: sickle = self.sickle if not metadata_prefix: metadata_prefix = self.metadata_prefix if not set_: set_ = self.set_ if not from_: if self.from_: from_ = self.from_ else: return sickle.ListIdentifiers(metadataPrefix=metadata_prefix, set=set_) return sickle.ListIdentifiers( **{ "metadataPrefix": metadata_prefix, "set": set_, "from": from_.format("YYYY-MM-DD") }) def create_or_update(self, oai_identifier, datestamp: str, oai_rec=None, xml: _Element = None): if oai_rec: if not self.overwrite: our_datestamp = arrow.get(oai_rec.timestamp) oai_record_datestamp = arrow.get(datestamp) if our_datestamp >= oai_record_datestamp: logger.info( f'Record with oai_identifier "{oai_identifier}" already exists' ) return if not xml: xml = self.get_xml(oai_identifier) parsed = self.parse(xml) if self.pre_processors: for processor in self.pre_processors: parsed = processor(parsed) transformed = self.transform(parsed) if self.post_processors: for processor in self.post_processors: transformed = processor(transformed) transformed.update(self.constant_fields) if oai_rec is None: record, pid = self.create_record(transformed) oai_rec = OAIRecord( id=record.id, 
# oai_identifier=oai_identifier, creation_sync_id=self.oai_sync.id, pid=pid.pid_value) oai_identifier = OAIIdentifier(oai_record_id=oai_rec.id, oai_identifier=oai_identifier) self.created += 1 db.session.add(oai_rec) oai_rec.oai_identifiers.append(oai_identifier) logger.info( f"Identifier '{oai_identifier}' has been created and '{record.id}' has been " f"assigned as a UUID") else: record = self.update_record(oai_rec, transformed) self.modified += 1 oai_rec.modification_sync_id = self.oai_sync.id logger.info( f"Identifier '{oai_identifier}' has been updated (UUID: {record.id})" ) oai_rec.last_sync_id = self.oai_sync.id oai_rec.timestamp = arrow.get(datestamp).datetime return record def create_or_update_es(self, oai_identifier, xml: _Element = None, index: str = None): if not index: index = self.index if not xml: xml = self.get_xml(oai_identifier) parsed = transform_to_dict(self.parse(xml)) try: es_record = self.es_client.get(id=oai_identifier, index=index) except NotFoundError: es_record = None if es_record is None: self.es_client.create(index, oai_identifier, parsed) logger.info(f'Record {oai_identifier} was created in ES') else: self.es_client.update(index=index, id=oai_identifier, body={"doc": parsed}) logger.info(f'Record {oai_identifier} was updated in ES') return parsed def transform(self, parsed, handler=None): if not handler: handler = self.transformer.transform return handler(parsed) def get_xml(self, oai_identifier, retry=True): try: original_record = self.sickle.GetRecord( identifier=oai_identifier, metadataPrefix=self.metadata_prefix) except HTTPError: if retry: time.sleep(1) original_record = self.sickle.GetRecord( identifier=oai_identifier, metadataPrefix=self.metadata_prefix) else: raise return original_record.xml def parse(self, xml_etree, parser=None): if not parser or not callable(parser): if self.parser: parser = self.parser if parser is None: raise ParserNotFoundError( "No parser specified, please check entry points and parser designation by " "decorator @Decorators.parser or specify parser as function parameter." 
) return parser(xml_etree) def create_record(self, data): endpoint_config = self.get_endpoint_config(data) minter = self.get_minter(data, endpoint_config=endpoint_config) record_class = self.get_record_class(data, endpoint_config=endpoint_config) indexer_class = self.get_indexer_class(data, endpoint_config=endpoint_config) # Create uuid for record record_uuid = uuid.uuid4() # Create persistent identifier pid = minter(record_uuid, data=data) # Create record try: record = record_class.create(data, id_=pid.object_uuid) except: db.session.rollback() raise else: db.session.commit() # Index the record if indexer_class: indexer_class().index(record) return record, pid def update_record(self, oai_rec, data): endpoint_config = self.get_endpoint_config(data) indexer_class = self.get_indexer_class(data, endpoint_config=endpoint_config) record_class = self.get_record_class(data, endpoint_config=endpoint_config) fetcher = self.get_fetcher(data) try: record = record_class.get_record(oai_rec.id) except NoResultFound: record = record_class.get_record(oai_rec.id, with_deleted=True) record.revert(-2) record.update(record.model.json) fetched_pid = fetcher(oai_rec.id, dict(record)) record.clear() record.update(data) record[self.pid_field] = fetched_pid.pid_value record.commit() db.session.commit() if indexer_class: indexer_class().index(record) return record def delete_record(self, oai_rec): if not oai_rec: return indexer_class = self.get_indexer_class() record = Record.get_record(oai_rec.id) record.delete() # TODO: rozmyslet se jak nakládat s PIDy # # mark all PIDs as DELETED # all_pids = PersistentIdentifier.query.filter( # PersistentIdentifier.object_uuid == record.id, # ).all() # for rec_pid in all_pids: # if not rec_pid.is_deleted(): # rec_pid.delete() db.session.commit() if indexer_class: indexer_class().delete(record) def get_endpoint_config(self, data): endpoint_name = None if not data: data = {} if self.endpoint_mapping: endpoint_name = self.endpoint_mapping["mapping"].get( data.get(self.endpoint_mapping["field_name"])) if not endpoint_name and self.endpoint_handler: provider = self.endpoint_handler.get(self.provider_code) if provider: handler = provider.get(self.metadata_prefix) if handler: endpoint_name = handler(data) draft_configs = current_app.config.get("RECORDS_DRAFT_ENDPOINTS") if draft_configs: draft_endpoint_config = draft_configs.get(endpoint_name) if draft_endpoint_config: draft_endpoint_name = draft_endpoint_config.get("draft") if draft_endpoint_name: endpoint_name = draft_endpoint_name endpoint_config = self.endpoints.get( endpoint_name) or self.endpoints.get(self.default_endpoint) return endpoint_config def get_minter(self, data=None, endpoint_config=None): if not endpoint_config: endpoint_config = self.get_endpoint_config(data) minter_name = endpoint_config["pid_minter"] return current_pidstore.minters.get(minter_name) def get_fetcher(self, data=None, endpoint_config=None): if not endpoint_config: endpoint_config = self.get_endpoint_config(data) fetcher_name = endpoint_config["pid_fetcher"] return current_pidstore.fetchers.get(fetcher_name) def get_record_class(self, data=None, endpoint_config=None): if not endpoint_config: endpoint_config = self.get_endpoint_config(data) record_class = endpoint_config["record_class"] return obj_or_import_string(record_class) def get_indexer_class(self, data=None, endpoint_config=None): if not endpoint_config: endpoint_config = self.get_endpoint_config(data) indexer_class = endpoint_config.get( "indexer_class", 'invenio_indexer.api.RecordIndexer') 
return obj_or_import_string(indexer_class) def restart_counters(self): self.deleted = 0 self.created = 0 self.modified = 0 def delete_es(self, oai_identifier): try: self.es_client.get(id=oai_identifier, index=self.index) self.es_client.delete(index=self.index, id=oai_identifier) except NotFoundError: pass
class TestCase(unittest.TestCase):

    def __init__(self, methodName='runTest'):
        super(TestCase, self).__init__(methodName)
        self.patch = mock.patch('sickle.app.Sickle.harvest', mock_harvest)

    def setUp(self):
        self.patch.start()
        self.sickle = Sickle('http://localhost')

    def tearDown(self):
        self.patch.stop()

    def test_OAIResponse(self):
        response = self.sickle.harvest(verb='ListRecords',
                                       metadataPrefix='oai_dc')
        self.assertIsInstance(response.xml, etree._Element)
        self.assertIsInstance(response.raw, string_types)

    def test_broken_XML(self):
        response = self.sickle.harvest(verb='ListRecords',
                                       resumptionToken='ListRecordsBroken.xml')
        self.assertEqual(response.xml, None)
        self.assertIsInstance(response.raw, string_types)

    def test_ListRecords(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc')
        assert len([r for r in records]) == 8

    def test_ListRecords_ignore_deleted(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc',
                                          ignore_deleted=True)
        num_records = len([r for r in records])
        assert num_records == 4

    def test_ListSets(self):
        set_iterator = self.sickle.ListSets()
        sets = [s for s in set_iterator]
        self.assertEqual(131, len(sets))
        dict(sets[0])

    def test_ListMetadataFormats(self):
        mdf_iterator = self.sickle.ListMetadataFormats()
        mdfs = [mdf for mdf in mdf_iterator]
        self.assertEqual(5, len(mdfs))
        dict(mdfs[0])

    def test_ListIdentifiers(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc')
        self.assertEqual(len([r for r in records]), 4)

    def test_ListIdentifiers_ignore_deleted(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc',
                                              ignore_deleted=True)
        # There are 2 deleted headers in the test data
        num_records = len([r for r in records])
        self.assertEqual(num_records, 2)

    def test_Identify(self):
        identify = self.sickle.Identify()
        self.assertTrue(hasattr(identify, 'repositoryName'))
        self.assertTrue(hasattr(identify, 'baseURL'))
        self.assertTrue(hasattr(identify, 'adminEmail'))
        self.assertTrue(hasattr(identify, 'earliestDatestamp'))
        self.assertTrue(hasattr(identify, 'deletedRecord'))
        self.assertTrue(hasattr(identify, 'granularity'))
        self.assertTrue(hasattr(identify, 'description'))
        self.assertTrue(hasattr(identify, 'oai_identifier'))
        self.assertTrue(hasattr(identify, 'sampleIdentifier'))
        dict(identify)

    def test_GetRecord(self):
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        self.assertEqual(record.header.identifier, oai_id)
        self.assertIn(oai_id, record.raw)
        self.assertEqual(record.header.datestamp, '2011-09-05T12:51:52Z')
        self.assertIsInstance(record.xml, etree._Element)
        binary_type(record)
        text_type(record)
        dict(record.header)
        self.assertEqual(dict(record), record.metadata)

    # Test OAI-specific exceptions

    @raises(BadArgument)
    def test_badArgument(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc', error='badArgument')

    @raises(CannotDisseminateFormat)
    def test_cannotDisseminateFormat(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='cannotDisseminateFormat')

    @raises(IdDoesNotExist)
    def test_idDoesNotExist(self):
        self.sickle.GetRecord(metadataPrefix='oai_dc', error='idDoesNotExist')

    @raises(NoSetHierarchy)
    def test_noSetHierarchy(self):
        self.sickle.ListSets(metadataPrefix='oai_dc', error='noSetHierarchy')

    @raises(BadResumptionToken)
    def test_badResumptionToken(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badResumptionToken')

    @raises(NoRecordsMatch)
    def test_noRecordsMatch(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc', error='noRecordsMatch')

    @raises(OAIError)
    def test_undefined_OAI_error_XML(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc', error='undefinedError')

    def test_OAIResponseIterator(self):
        sickle = Sickle('fake_url', iterator=OAIResponseIterator)
        records = [r for r in sickle.ListRecords(metadataPrefix='oai_dc')]
        self.assertEqual(len(records), 4)
def harvest(self, request): ## harvest (Harvester object, request = [community, source, verb, mdprefix, mdsubset]) # Harvest all files with <mdprefix> and <mdsubset> from <source> via sickle module and store those to hard drive. # # Parameters: # ----------- # (list) request - A list with following items: # 1. community # 2. source (OAI URL) # 3. verb (ListIdentifiers, ListRecords or JSONAPI) # 4. mdprefix (OAI md format as oai_dc, iso etc.) # 5. mdsubset # # Return Values: # -------------- # 1. (integer) is -1 if something went wrong # create a request dictionary: req = { "community": request[0], "url": request[1], "lverb": request[2], "mdprefix": request[3], "mdsubset": request[4] if len(request) > 4 else None } # create dictionary with stats: resKeys = ['count', 'tcount', 'ecount', 'time'] results = dict.fromkeys(resKeys, 0) stats = { "tottcount": 0, # total number of provided datasets "totcount": 0, # total number of successful harvested datasets "totecount": 0, # total number of failed datasets "totdcount": 0, # total number of all deleted datasets "tcount": 0, # number of all provided datasets per subset "count": 0, # number of all successful harvested datasets per subset "ecount": 0, # number of all failed datasets per subset "dcount": 0, # number of all deleted datasets per subset "timestart": time.time(), # start time per subset process } # the gbif api client class GBIF_CLIENT(object): # call action api: ## GBIF.action('package_list',{}) def __init__(self, api_url): ##, api_key): self.api_url = api_url self.logger = logging.getLogger('root') def JSONAPI(self, action, offset, chunklen, key): ## JSONAPI (action) - method return self.__action_api(action, offset, chunklen, key) def __action_api(self, action, offset, chunklen, key): # Make the HTTP request for get datasets from GBIF portal response = '' rvalue = 0 ## offset = 0 limit = chunklen ## None for DataCite-JSON !!! api_url = self.api_url if key: action_url = "{apiurl}/{action}/{key}".format( apiurl=api_url, action=action, key=str(key)) elif offset == None: action_url = "{apiurl}/{action}".format(apiurl=api_url, action=action) else: action_url = "{apiurl}/{action}?offset={offset}&limit={limit}".format( apiurl=api_url, action=action, offset=str(offset), limit=str(limit)) self.logger.debug('action_url: %s' % action_url) try: request = Request(action_url) response = urlopen(request) except HTTPError as e: self.logger.error( '%s : The server %s couldn\'t fulfill the action %s.' % (e.code, self.api_url, action)) if (e.code == 403): self.logger.critical( 'Access forbidden, maybe the API key is not valid?' 
) exit(e.code) elif (e.code == 409): self.logger.critical( 'Maybe you have a parameter error?') return {"success": False} elif (e.code == 500): self.logger.critical('Internal server error') exit(e.code) except URLError as e: exit('%s' % e.reason) else: out = json.loads(response.read()) assert response.code >= 200 return out requests_log = logging.getLogger("requests") requests_log.setLevel(logging.WARNING) # if the number of files in a subset dir is greater than <count_break> # then create a new one with the name <set> + '_' + <count_set> count_break = 5000 count_set = 1 start = time.time() # set subset: mdsubset = req["mdsubset"] if (not mdsubset): subset = 'SET' elif mdsubset.endswith( '_' ): # no OAI subsets, but different OAI-URLs for same community subset = mdsubset[:-1] mdsubset = None elif len(mdsubset) > 2 and mdsubset[-1].isdigit( ) and mdsubset[-2] == '_': subset = mdsubset[:-2] else: subset = mdsubset if req["community"] == "b2share" or re.match( r'http(.*?)b2share(.*?)api(.*?)', req["url"]): setMapFile = '%s/mapfiles/b2share_mapset.json' % (os.getcwd()) elif req["community"] == "dara" and req[ "url"] == "https://www.da-ra.de/oaip/oai": setMapFile = '%s/mapfiles/dara_mapset.json' % (os.getcwd()) else: setMapFile = None if setMapFile: with open(setMapFile) as sm: setMap = json.load(sm) if mdsubset in setMap: mdsubset = setMap[mdsubset] if (self.fromdate): subset = subset + '_f' + self.fromdate self.logger.debug(' |- Subset: \t%s' % subset) # make subset dir: subsetdir = '/'.join([ self.base_outdir, req['community'] + '-' + req['mdprefix'], subset + '_' + str(count_set) ]) noffs = 0 # set to number of record, where harvesting should start stats['tcount'] = noffs fcount = 0 oldperc = 0 ntotrecs = 0 choffset = 0 chunklen = 1000 pageno = 1 records = list() ## JSON-API jsonapi_verbs = ['dataset', 'works', 'records'] if req["lverb"] in jsonapi_verbs: GBIF = GBIF_CLIENT(req['url']) # create GBIF object harvestreq = getattr(GBIF, 'JSONAPI', None) outtypedir = 'hjson' outtypeext = 'json' if mdsubset and req["lverb"] == 'works': haction = 'works?publisher-id=' + mdsubset dresultkey = 'data' elif req["lverb"] == 'records': haction = req["lverb"] if mdsubset: haction += '?q=community:' + mdsubset + '&size=' + str( chunklen) + '&page=' + str(pageno) dresultkey = 'hits' else: haction = req["lverb"] dresultkey = 'results' try: chunk = harvestreq( **{ 'action': haction, 'offset': None, 'chunklen': chunklen, 'key': None }) self.logger.debug(" Got first %d records : chunk['data'] %s " % (chunklen, chunk[dresultkey])) except (HTTPError, ConnectionError, Exception) as e: self.logger.critical( "%s :\n\thaction %s\n\tharvest request %s\n" % (e, haction, req)) return -1 if req["lverb"] == 'dataset': while ('endOfRecords' in chunk and not chunk['endOfRecords']): if 'results' in chunk: records.extend(chunk['results']) choffset += chunklen chunk = harvestreq( **{ 'action': haction, 'offset': choffset, 'chunklen': chunklen, 'key': None }) self.logger.debug( " Got next records [%d,%d] from chunk %s " % (choffset, choffset + chunklen, chunk)) elif req["lverb"] == 'records': records.extend(chunk['hits']['hits']) while ('hits' in chunk and 'next' in chunk['links']): if 'hits' in chunk: records.extend(chunk['hits']['hits']) pageno += 1 chunk = harvestreq( **{ 'action': haction, 'page': pageno, 'size': chunklen, 'key': None }) self.logger.debug( " Got next records [%d,%d] from chunk %s " % (choffset, choffset + chunklen, chunk)) else: if 'data' in chunk: records.extend(chunk['data']) # OAI-PMH (verb = 
ListRecords/Identifier ) elif req["lverb"].startswith('List'): sickle = Sickle(req['url'], max_retries=3, timeout=300) outtypedir = 'xml' outtypeext = 'xml' harvestreq = getattr(sickle, req["lverb"], None) try: records, rc = tee( harvestreq( **{ 'metadataPrefix': req['mdprefix'], 'set': mdsubset, 'ignore_deleted': True, 'from': self.fromdate })) except (HTTPError, ConnectionError) as err: self.logger.critical("%s during connecting to %s\n" % (err, req['url'])) return -1 except (ImportError, etree.XMLSyntaxError, CannotDisseminateFormat, Exception) as err: self.logger.critical("%s during harvest request %s\n" % (err, req)) return -1 # CSW2.0 elif req["lverb"].startswith('csw'): outtypedir = 'xml' outtypeext = 'xml' startposition = 0 maxrecords = 20 try: src = CatalogueServiceWeb(req['url']) NS = Namespaces() namespaces = NS.get_namespaces() if req['mdprefix'] == 'iso19139' or req['mdprefix'] == 'own': nsp = namespaces['gmd'] else: nsp = namespaces['csw'] harvestreq = getattr(src, 'getrecords2') chunk = harvestreq( **{ 'esn': 'full', 'startposition': choffset, 'maxrecords': maxrecords, 'outputschema': nsp }) chunklist = list(src.records.items()) while (len(chunklist) > 0): records.extend(chunklist) choffset += maxrecords chunk = harvestreq( **{ 'esn': 'full', 'startposition': choffset, 'maxrecords': maxrecords, 'outputschema': nsp }) chunklist = list(src.records.items()) self.logger.debug( " Got next %s records [%d,%d] from chunk " % (nsp, choffset, choffset + chunklen)) except (HTTPError, ConnectionError) as err: self.logger.critical("%s during connecting to %s\n" % (err, req['url'])) return -1 except (ImportError, CannotDisseminateFormat, Exception) as err: self.logger.error("%s : During harvest request %s\n" % (err, req)) ##return -1 # SparQL elif req["lverb"].startswith('Sparql'): outtypedir = 'hjson' outtypeext = 'json' startposition = 0 maxrecords = 1000 try: src = SPARQLWrapper(req['url']) harvestreq = getattr(src, 'query', 'format') ## statement = ''' prefix cpmeta: <http://meta.icos-cp.eu/ontologies/cpmeta/> prefix prov: <http://www.w3.org/ns/prov#> select (str(?submTime) as ?time) ?dobj ?spec ?dataLevel ?fileName ?submitterName where{ ?dobj cpmeta:hasObjectSpec [rdfs:label ?spec ; cpmeta:hasDataLevel ?dataLevel]. ?dobj cpmeta:hasName ?fileName . ?dobj cpmeta:wasSubmittedBy ?submission . ?submission prov:endedAtTime ?submTime . ?submission prov:wasAssociatedWith [cpmeta:hasName ?submitterName]. } order by desc(?submTime) limit 1000 ''' src.setQuery(statement) src.setReturnFormat(JSON) records = harvestreq().convert()['results']['bindings'] except (HTTPError, ConnectionError) as err: self.logger.critical("%s during connecting to %s\n" % (err, req['url'])) return -1 except (ImportError, CannotDisseminateFormat, Exception) as err: self.logger.critical("%s during harvest request %s\n" % (err, req)) return -1 else: self.logger.critical(' Not supported harvest type %s' % req["lverb"]) sys.exit() self.logger.debug(" Harvest method used %s" % req["lverb"]) try: if req["lverb"].startswith('List'): ntotrecs = len(list(rc)) else: ntotrecs = len(records) except Exception as err: self.logger.error('%s Iteratation does not work ?' 
% (err)) print("\t|- Retrieved %d records in %d sec - write %s files to disc" % (ntotrecs, time.time() - start, outtypeext.upper())) if ntotrecs == 0: self.logger.warning("\t|- Can not access any records to harvest") return -1 self.logger.debug(' | %-4s | %-25s | %-25s |' % ('#', 'OAI Identifier', 'DS Identifier')) start2 = time.time() if (not os.path.isdir(subsetdir + '/' + outtypedir)): os.makedirs(subsetdir + '/' + outtypedir) delete_ids = list() # loop over records for record in records: ## counter and progress bar stats['tcount'] += 1 fcount += 1 if fcount <= noffs: continue if ntotrecs > 0: perc = int(fcount * 100 / ntotrecs) bartags = int(perc / 5) if perc % 10 == 0 and perc != oldperc: oldperc = perc print("\r\t[%-20s] %5d (%3d%%) in %d sec" % ('=' * bartags, fcount, perc, time.time() - start2)) sys.stdout.flush() # Set oai_id and generate a uniquely identifier for this dataset: delete_flag = False if req["lverb"] == 'dataset' or req["lverb"] == 'works' or req[ "lverb"] == 'records': ## Harvest via JSON-API if 'key' in record: oai_id = record['key'] elif 'id' in record: oai_id = record['id'] elif req["lverb"] == 'csw': ## Harvest via CSW2.0 if hasattr(record, 'identifier'): oai_id = record.identifier elif (record): oai_id = record[0] else: self.logger.critical( 'Record %s has no attrribute identifier %s' % record) elif req[ "lverb"] == 'ListIdentifiers': ## OAI-PMH harvesting of XML records if (record.deleted): stats['totdcount'] += 1 delete_flag = True ##HEW-D continue else: oai_id = record.identifier record = sickle.GetRecord( **{ 'metadataPrefix': req['mdprefix'], 'identifier': record.identifier }) elif req["lverb"] == 'ListRecords': if (record.header.deleted): stats['totdcount'] += 1 continue else: oai_id = record.header.identifier elif req["lverb"].startswith('Sparql'): oai_id = record['fileName']['value'] # generate a uniquely identifier and a filename for this dataset: uid = str(uuid.uuid5(uuid.NAMESPACE_DNS, oai_id)) outfile = '%s/%s/%s.%s' % (subsetdir, outtypedir, os.path.basename(uid), outtypeext) if delete_flag: # record marked as deleted on provider site jsonfile = '%s/%s/%s.%s' % (subsetdir, 'json', os.path.basename(uid), 'json') # remove xml and json file: os.remove(xmlfile) os.remove(jsonfile) delete_ids.append(uid) # write record on disc try: self.logger.debug(' | h | %-4d | %-45s | %-45s |' % (stats['count'] + 1, oai_id, uid)) self.logger.debug( 'Try to write the harvested JSON record to %s' % outfile) if outtypeext == 'xml': # get and write the XML content: if req["lverb"] == 'csw': metadata = etree.fromstring(record[1].xml) elif hasattr(record, 'raw'): metadata = etree.fromstring(record.raw) elif hasattr(record, 'xml'): metadata = etree.fromstring(record.xml) if (metadata is not None): try: metadata = etree.tostring( metadata, pretty_print=True).decode('utf-8') except (Exception, UnicodeEncodeError) as e: self.logger.critical('%s : Metadata: %s ...' % (e, metadata[:20])) ##if PY2 : ## try: ## metadata = metadata.encode('utf-8') ## except (Exception,UnicodeEncodeError) as e : ## self.logger.debug('%s : Metadata : %s ...' 
% (e,metadata[20])) try: f = open(outfile, 'w') f.write(metadata) f.close except (Exception, IOError) as err: self.logger.critical( "%s : Cannot write metadata in xml file %s" % (err, outfile)) stats['ecount'] += 1 continue else: logging.debug('Harvested XML file written to %s' % outfile) stats['count'] += 1 else: stats['ecount'] += 1 self.logger.error('No metadata available for %s' % record) elif outtypeext == 'json': # get the raw json content: if (record is not None): try: with open(outfile, 'w') as f: json.dump(record, f, sort_keys=True, indent=4) except IOError: logging.error( "[ERROR] Cannot write metadata in out file '%s': %s\n" % (outfile)) stats['ecount'] += 1 continue else: stats['count'] += 1 logging.debug('Harvested JSON file written to %s' % outfile) else: stats['ecount'] += 1 logging.warning( ' [WARNING] No metadata available for %s' % record['key']) ##HEW-???' % oai_id) except TypeError as e: logging.error(' [ERROR] TypeError: %s' % e) stats['ecount'] += 1 continue except Exception as e: logging.error(" [ERROR] %s and %s" % (e, traceback.format_exc())) ## logging.debug(metadata) stats['ecount'] += 1 continue # Next or last subset? if (stats['count'] == count_break) or (fcount == ntotrecs): print(' | %d records written to subset directory %s ' % (stats['count'], subsetdir)) # clean up current subset and write ids to remove to delete file for df in os.listdir(subsetdir + '/' + outtypedir): df = os.path.join(subsetdir + '/' + outtypedir, df) logging.debug('File to delete : %s' % df) id = os.path.splitext(os.path.basename(df))[0] jf = os.path.join(subsetdir + '/json/', id + '.json') if os.stat(df).st_mtime < start - 1 * 86400: os.remove(df) logging.warning('File %s is deleted' % df) if os.path.exists(jf): os.remove(jf) logging.warning('File %s is deleted' % jf) delete_ids.append(id) logging.warning('Append Id %s to list delete_ids' % id) stats['dcount'] += 1 print(' | %d records deleted from subset directory %s ' % (stats['dcount'], subsetdir)) if not fcount == ntotrecs: # next subset neded subsetdir = self.save_subset(req, stats, subset, count_set) if (not os.path.isdir(subsetdir + '/' + outtypedir)): os.makedirs(subsetdir + '/' + outtypedir) count_set += 1 # add all subset stats to total stats and reset the temporal subset stats: for key in ['tcount', 'ecount', 'count', 'dcount']: stats['tot' + key] += stats[key] stats[key] = 0 # start with a new time: stats['timestart'] = time.time() logging.debug( ' | %d records written to subset directory %s (if not failed).' % (stats['count'], subsetdir)) # path to the file with all ids to delete: delete_file = '/'.join([ self.base_outdir, 'delete', req['community'] + '-' + req['mdprefix'] + '.del' ]) if len(delete_ids) > 0: with open(delete_file, 'a') as file: for id in delete_ids: file.write(id + '\n') # add all subset stats to total stats and reset the temporal subset stats: for key in ['tcount', 'ecount', 'count', 'dcount']: stats['tot' + key] += stats[key] print( ' \t|- %-10s |@ %-10s |\n\t| Provided | Harvested | Failed | Deleted |\n\t| %8d | %9d | %6d | %6d |' % ('Finished', time.strftime("%H:%M:%S"), stats['tottcount'], stats['totcount'], stats['totecount'], stats['totdcount']))
class OAIRepository(HarvestRepository): """ OAI Repository """ def setRepoParams(self, repoParams): self.metadataprefix = "oai_dc" self.default_language = "en" super(OAIRepository, self).setRepoParams(repoParams) self.sickle = Sickle(self.url, iterator=FRDRItemIterator) def _crawl(self): records = [] try: if self.set is None or self.set == "": records = self.sickle.ListRecords( metadataPrefix=self.metadataprefix, ignore_deleted=True) else: records = self.sickle.ListRecords( metadataPrefix=self.metadataprefix, ignore_deleted=True, set=self.set) except: self.logger.info("No items were found") kwargs = { "repo_id": self.repository_id, "repo_url": self.url, "repo_set": self.set, "repo_name": self.name, "repo_type": "oai", "enabled": self.enabled, "repo_thumbnail": self.thumbnail, "item_url_pattern": self.item_url_pattern, "abort_after_numerrors": self.abort_after_numerrors, "max_records_updated_per_run": self.max_records_updated_per_run, "update_log_after_numitems": self.update_log_after_numitems, "record_refresh_days": self.record_refresh_days, "repo_refresh_days": self.repo_refresh_days, "homepage_url": self.homepage_url, "repo_oai_name": self.repo_oai_name } self.repository_id = self.db.update_repo(**kwargs) item_count = 0 while records: try: record = records.next() metadata = record.metadata # Search for a hyperlink in the list of identifiers if 'identifier' in metadata.keys(): if not isinstance(metadata['identifier'], list): metadata['identifier'] = [metadata['identifier']] for idt in metadata['identifier']: # TODO - what about multiple identifiers? We should have some priority here, so we always pick the same one regardless of ordering if idt.lower().startswith("http"): metadata['dc:source'] = idt if idt.lower().startswith("doi:"): metadata[ 'dc:source'] = "https://doi.org/" + idt[4:] if idt.lower().startswith("hdl:"): metadata[ 'dc:source'] = "https://hdl.handle.net/" + idt[ 4:] # EPrints workaround for using header datestamp in lieu of date if 'date' not in metadata.keys() and record.header.datestamp: metadata["date"] = record.header.datestamp # Use the header id for the database key (needed later for OAI GetRecord calls) metadata['identifier'] = record.header.identifier oai_record = self.unpack_oai_metadata(metadata) self.domain_metadata = self.find_domain_metadata(metadata) self.db.write_record(oai_record, self) item_count = item_count + 1 if (item_count % self.update_log_after_numitems == 0): tdelta = time.time() - self.tstart + 0.1 self.logger.info( "Done {} items after {} ({:.1f} items/sec)".format( item_count, self.formatter.humanize(tdelta), (item_count / tdelta))) except AttributeError: # probably not a valid OAI record # Islandora throws this for non-object directories self.logger.debug( "AttributeError while working on item {}".format( item_count)) pass except StopIteration: break self.logger.info("Processed {} items in feed".format(item_count)) def unpack_oai_metadata(self, record): record["pub_date"] = record.get("date") if self.metadataprefix.lower() == "ddi": # TODO: better DDI implementation that doesn't simply flatten everything, see: https://sickle.readthedocs.io/en/latest/customizing.html # Mapping as per http://www.ddialliance.org/resources/ddi-profiles/dc record["title"] = record.get("titl") record["creator"] = record.get("AuthEnty") record["tags"] = record.get("keyword", []) if "topcClas" in record.keys() and len(record["topcClas"]) > 0: record['tags'].extend(filter(None, record["topcClas"])) record["description"] = record.get("abstract") record["publisher"] = 
record.get("producer") record["contributor"] = record.get("othId") record["pub_date"] = record.get("prodDate") record["type"] = record.get("dataKind") record["identifier"] = record.get("IDNo") record["rights"] = record.get("copyright") if self.metadataprefix.lower() == "fgdc" or self.metadataprefix.lower( ) == "fgdc-std": record["creator"] = [] for creator in record.get("origin"): if creator not in record["creator"]: record["creator"].append(creator) record["tags"] = record.get("themekey") record["description"] = record.get("abstract") record["publisher"] = record.get("cntorg") # Put these dates in preferred order record["pub_date"] = [ record.get("pubdate"), record.get("begdate"), record.get("enddate") ] record["type"] = record.get("geoform") record["dc:source"] = record.get("onlink") record["rights"] = record.get("distliab") record["access"] = record.get("accconst") if "placekt" in record.keys(): record["coverage"] = record["placekt"] if "bounding" in record.keys(): record["geobboxes"] = [{ "westLon": record["westbc"][0], "eastLon": record["eastbc"][0], "northLat": record["northbc"][0], "southLat": record["southbc"][0] }] # Parse FRDR records if self.metadataprefix.lower() == "frdr": if "http://datacite.org/schema/kernel-4#geolocationPlace" in record.keys( ): record["coverage"] = record.get( "http://datacite.org/schema/kernel-4#geolocationPlace") if "http://datacite.org/schema/kernel-4#geolocationPoint" in record.keys( ): record["geopoints"] = [] for geopoint in record[ "http://datacite.org/schema/kernel-4#geolocationPoint"]: point_split = re.compile(",? ").split(geopoint) if len(point_split) == 2: record["geopoints"].append({ "lat": point_split[0], "lon": point_split[1] }) if "http://datacite.org/schema/kernel-4#geolocationBox" in record.keys( ): record["geobboxes"] = [] for geobbox in record[ "http://datacite.org/schema/kernel-4#geolocationBox"]: boxcoordinates = geobbox.split() if len(boxcoordinates) == 4: record["geobboxes"].append({ "southLat": boxcoordinates[0], "westLon": boxcoordinates[1], "northLat": boxcoordinates[2], "eastLon": boxcoordinates[3] }) # Look for datacite.creatorAffiliation if "http://datacite.org/schema/kernel-4#creatorAffiliation" in record: record["affiliation"] = record.get( "http://datacite.org/schema/kernel-4#creatorAffiliation") if 'identifier' not in record.keys(): return None if record["pub_date"] is None: return None # If there are multiple identifiers, and one of them contains a link, then prefer it # Otherwise just take the first one if isinstance(record["identifier"], list): valid_id = record["identifier"][0] for idstring in record["identifier"]: if "http" in idstring.lower(): valid_id = idstring record["identifier"] = valid_id if 'creator' not in record.keys() and 'contributor' not in record.keys( ) and 'publisher' not in record.keys(): self.logger.debug( "Item {} is missing creator - will not be added".format( record["identifier"])) return None elif 'creator' not in record.keys() and 'contributor' in record.keys(): record["creator"] = record["contributor"] elif 'creator' not in record.keys() and 'publisher' in record.keys(): record["creator"] = record["publisher"] # Workaround for WOUDC, which doesn't attribute individual datasets elif self.metadataprefix.lower() == "fgdc-std": record["creator"] = self.name # If date is undefined add an empty key if 'pub_date' not in record.keys(): record["pub_date"] = "" # If there are multiple dates choose the longest one (likely the most specific) # If there are a few dates with the same length the first one will 
be used, which assumes we grabbed them in a preferred order # Exception test added for some strange PDC dates of [null, null] if isinstance(record["pub_date"], list): valid_date = record["pub_date"][0] or "" for datestring in record["pub_date"]: if datestring is not None: if len(datestring) > len(valid_date): valid_date = datestring record["pub_date"] = valid_date # If date is still a one-value list, make it a string if isinstance(record["pub_date"], list): record["pub_date"] = record["pub_date"][0] # If a date has question marks, chuck it if "?" in record["pub_date"]: return None try: date_object = dateparser.parse(record["pub_date"]) if date_object is None: date_object = dateparser.parse(record["pub_date"], date_formats=['%Y%m%d']) record["pub_date"] = date_object.strftime("%Y-%m-%d") except: self.logger.debug( "Something went wrong parsing the date, {} from {}", record["pub_date"], (record["dc:source"] if record["identifier"] is None else record["identifier"])) return None if "title" not in record.keys(): return None language = self.default_language if "language" in record.keys(): if isinstance(record["language"], list): record["language"] = record["language"][0].strip() record["language"] = record["language"].lower() if record["language"] in ["fr", "fre", "fra", "french"]: language = "fr" if language == "fr": if isinstance(record["title"], list): record["title_fr"] = record["title"][0].strip() else: record["title_fr"] = record["title"].strip() # Remove "title" from record since this is the English field record["title"] = "" if "tags_fr" not in record.keys(): record["tags_fr"] = record.get("subject") record.pop("subject", None) else: if isinstance(record["title"], list): record["title"] = record["title"][0].strip() else: record["title"] = record["title"].strip() record["title_fr"] = "" if "tags" not in record.keys(): record["tags"] = record.get("subject") record.pop("subject", None) if "publisher" in record.keys(): if isinstance(record["publisher"], list): record["publisher"] = record["publisher"][0] if "series" not in record.keys(): record["series"] = "" if "coverage" in record.keys(): record["geoplaces"] = [] if self.name == "SFU Radar": record["coverage"] = [ x.strip() for x in record["coverage"][0].split(";") ] if not isinstance(record["coverage"], list): record["coverage"] = [record["coverage"]] for place_name in record["coverage"]: if place_name != "" and place_name.lower().islower( ): # to filter out dates, confirm at least one letter record["geoplaces"].append({"place_name": place_name}) # DSpace workaround to exclude theses and non-data content if self.prune_non_dataset_items: if record["type"] and "Dataset" not in record["type"]: return None # EPrints workaround to fix duplicates and Nones in Rights if "rights" in record.keys() and isinstance(record["rights"], list): record["rights"] = list(set(filter(None.__ne__, record["rights"]))) # EPrints workaround for liberal use of dc:identifier # Rather not hardcode a single source URL for this if self.url == "http://spectrum.library.concordia.ca/cgi/oai2": for relation in record["relation"]: if "http://spectrum.library.concordia.ca" in relation: record["dc:source"] = relation return record def find_domain_metadata(self, record): excludedElements = [ 'http://datacite.org/schema/kernel-4#resourcetype', 'http://datacite.org/schema/kernel-4#creatorAffiliation', 'http://datacite.org/schema/kernel-4#publicationyear', 'https://www.frdr-dfdr.ca/schema/1.0/#globusEndpointName', 'https://www.frdr-dfdr.ca/schema/1.0/#globusEndpointPath' ] 
newRecord = {} for elementName in list(record.keys()): if '#' in elementName: if not [ ele for ele in excludedElements if (ele in elementName) ]: newRecord[elementName] = record.pop(elementName, None) return newRecord @rate_limited(5) def _update_record(self, record): #self.logger.debug("Updating OAI record {}".format(record['local_identifier'])) try: single_record = self.sickle.GetRecord( identifier=record["local_identifier"], metadataPrefix=self.metadataprefix) try: metadata = single_record.metadata if 'identifier' in metadata.keys() and isinstance( metadata['identifier'], list): if "http" in metadata['identifier'][0].lower(): metadata['dc:source'] = metadata['identifier'][0] except AttributeError: metadata = {} # EPrints workaround for using header datestamp in lieu of date if 'date' not in metadata.keys( ) and single_record.header.datestamp: metadata["date"] = single_record.header.datestamp metadata['identifier'] = single_record.header.identifier oai_record = self.unpack_oai_metadata(metadata) self.domain_metadata = self.find_domain_metadata(metadata) if oai_record is None: self.db.delete_record(record) return False self.db.write_record(oai_record, self) return True except IdDoesNotExist: # Item no longer in this repo self.db.delete_record(record) return True except Exception as e: self.logger.error( "Updating item failed (repo_id:{}, oai_id:{}): {}".format( self.repository_id, record['local_identifier'], e)) if self.dump_on_failure == True: try: print(single_record.metadata) except: pass # Touch the record so we do not keep requesting it on every run self.db.touch_record(record) self.error_count = self.error_count + 1 if self.error_count < self.abort_after_numerrors: return True return False
from oaiharvests.utils import *
from oaiharvests.models import *
from sickle import Sickle

community = Community.objects.all()[0]
batch_harvest_issues(community)

collection = Collection.objects.all()[7]
record = collection.record_set.all()[5]
record.hdr_identifier

try:
    sickle = Sickle(collection.community.repository.base_url)
    sickle.class_mapping['GetRecord'] = LltRecordBitstream
    record = sickle.GetRecord(metadataPrefix='ore',
                              identifier=record.identifier)
    print type(record)
    # print record.metadata['bitstream'][0].replace('+', '%20')
except Exception as e:
    print e, 'Unable to construct bitstream url.'
class OAIClient:

    def __init__(self, url, source_name, max_retries=3):
        self.sickle = Sickle(url, max_retries=max_retries, verify=False)
        self.sickle.class_mapping['ListRecords'] = SciELORecord
        self.sickle.class_mapping['GetRecord'] = SciELORecord
        self.source_name = source_name

    def get_record(self, metadata_prefix='oai_dc_scielo', identifier=None):
        if identifier:
            return [
                self.sickle.GetRecord(**{
                    'metadataPrefix': metadata_prefix,
                    'identifier': identifier
                })
            ]

    def get_records(self, metadata_prefix='oai_dc_scielo', from_date='',
                    until_date=''):
        try:
            from_date = datetime.strptime(from_date, '%Y-%m-%d')
            until_date = datetime.strptime(until_date, '%Y-%m-%d')
        except ValueError:
            raise exceptions.InvalidDateFormatError('Invalid date format')

        if from_date >= until_date:
            raise exceptions.InvalidDateRangeError(
                'Start date is greater than or equal to the end date')

        logging.info(
            f'Collecting data from {from_date.strftime("%Y-%m-%d")} '
            f'to {until_date.strftime("%Y-%m-%d")}'
        )

        try:
            records = self.sickle.ListRecords(**{
                'metadataPrefix': metadata_prefix,
                'from': from_date.strftime('%Y-%m-%d'),
                'until': until_date.strftime('%Y-%m-%d')
            })
        except NoRecordsMatch:
            logging.info('No records found')
            return []
        except (
                ConnectionError,
                ConnectionResetError,
                ConnectionAbortedError,
                ConnectionRefusedError,
                MaxRetryError,
                HTTPError,
                TimeoutError,
        ) as e:
            logging.error(e)
            return []

        return records

    def record_to_dict(self, record: SciELORecord):
        object = {}
        object['gathering_date'] = datetime.utcnow()
        object['gathering_source'] = self.source_name
        object['identifier'] = record.header.identifier
        object['date'] = record.header.date
        object['is_part_of'] = record.header.is_part_of
        object['metadata'] = record.get_metadata().get('metadata', {})
        return object
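# A short usage sketch; the base URL is an assumption about a SciELO-style
# OAI endpoint and the source name is a placeholder, not a verified service.
client = OAIClient('https://example.scielo.org/oai/', source_name='scielo-example')
for record in client.get_records(from_date='2023-01-01', until_date='2023-01-31'):
    print(client.record_to_dict(record)['identifier'])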
# oai = OAIUtils()
# oai.list_oai_collections(com)

base_url = 'http://scholarspace.manoa.hawaii.edu/dspace-oai/request'
llt_id = 'com_10125_27123'

s = Sickle(base_url)
record_headers = list(s.ListIdentifiers(metadataPrefix='oai_dc', set=llt_id))

community_collections = {}
for i in record_headers:
    # Iterate over associated sets looking for collections
    for j in i.setSpecs:
        if j[:3] == 'col':
            community_collections[j] = None  # register id in map

for i in s.ListSets():
    try:
        print community_collections[i.setSpec]
        community_collections[i.setSpec] = i.setName
        print i.setSpec, '==>', community_collections[i.setSpec]
        print i
    except KeyError as e:
        pass
        # print e, 'not a collection in llt ...'

sample = 'oai:scholarspace.manoa.hawaii.edu:10125/54329'
s.GetRecord(identifier=sample, metadataPrefix='oai_dc')
class OAISynchronizer(OAIDBBase): """ """ def __init__(self, provider: OAIProvider, parser_name: str = None, unhandled_paths: set = None, validation: Callable = None, create_record: Callable = None, delete_record: Callable = None, update_record: Callable = None, pid_type: str = None, oai_identifiers: List[str] = None): super().__init__(provider) self.pid_type = pid_type self.provider = provider self.oai_sync = None self.sickle = Sickle(self.provider.oai_endpoint) registry.load() self.parsers = provider.get_parsers() self.rules = provider.get_rules(parser_name) or {} self.parser = self.parsers.get(parser_name) or {} self.transformer = OAITransformer(self.rules, unhandled_paths=unhandled_paths) self.validation_handler = validation self.create_record_handler = create_record self.update_record_handler = update_record self.delete_record_handler = delete_record self.oai_identifiers = oai_identifiers def run(self, start_oai: str = None, start_id: int = None, break_on_error: bool = True): """ :return: :rtype: """ self.ensure_migration() super().run(start_oai=start_oai, start_id=start_id, break_on_error=break_on_error) def synchronize(self, identifiers=None, start_oai: str = None, start_id: int = None, break_on_error: bool = True): """ :return: :rtype: """ oai_logger.info( f"OAI harvester on endpoint: {self.provider.oai_endpoint} has started!" ) if identifiers is None: if self.oai_identifiers is None: identifiers = self._get_oai_identifiers() else: identifiers = self._get_oai_identifiers( identifiers_list=self.oai_identifiers) identifiers = islice(identifiers, start_id, None) collect = False for idx, identifier in enumerate(identifiers, start=start_id): oai_logger.info(f"{idx}. Record, OAI ID: '{identifier}'") datestamp = identifier.datestamp oai_identifier = identifier.identifier if not start_oai or oai_identifier == start_oai: collect = True if not collect: continue deleted = identifier.deleted try: if deleted: self._delete(identifier, oai_identifier) else: try: self.update(oai_identifier, datestamp) except IdDoesNotExist: self._delete(identifier, oai_identifier) if idx % 100: db.session.commit() except Exception: exc = traceback.format_exc() print(exc, "\n\n\n") oai_exc = OAIRecordExc.query.filter_by( oai_identifier=oai_identifier, oai_sync_id=self.oai_sync.id).one_or_none() if not oai_exc: oai_exc = OAIRecordExc(oai_identifier=oai_identifier, traceback=exc, oai_sync_id=self.oai_sync.id) db.session.add(oai_exc) else: oai_exc.traceback = exc db.session.commit() if break_on_error: raise continue def _delete(self, identifier, oai_identifier): self.delete(oai_identifier) self.deleted += 1 oai_logger.info( f"Identifier '{identifier}' has been marked as deleted") def _get_oai_identifiers(self, sickle=None, metadata_prefix=None, set_=None, identifiers_list: List[str] = None): if identifiers_list: return [ self.sickle.GetRecord( identifier=identifier, metadataPrefix=self.provider.metadata_prefix).header for identifier in identifiers_list ] if not sickle: sickle = self.sickle if not metadata_prefix: metadata_prefix = self.provider.metadata_prefix if not set_: set_ = self.provider.set_ return sickle.ListIdentifiers(metadataPrefix=metadata_prefix, set=set_) def update(self, oai_identifier, datestamp): """ :param oai_identifier: :type oai_identifier: :param datestamp: :type datestamp: :return: :rtype: """ xml = self.get_xml(oai_identifier) parsed = self.parse(xml) transformed = self.transform(parsed) transformed.update(self.provider.constant_fields) if self.validation_handler: 
self.validation_handler(transformed) oai_rec = OAIRecord.query.filter_by( oai_identifier=oai_identifier).one_or_none() if oai_rec is None: transformed = self.attach_id(transformed) record = self.create_record(transformed) oai_rec = OAIRecord(id=record.id, oai_identifier=oai_identifier, creation_sync_id=self.oai_sync.id, nusl_id=transformed["id"]) self.created += 1 db.session.add(oai_rec) oai_logger.info( f"Identifier '{oai_identifier}' has been created and '{record.id}' has been " f"assigned as a UUID") else: transformed = self.attach_id(transformed, nusl_id=oai_rec.nusl_id) record = self.update_record(transformed) self.modified += 1 oai_rec.modification_sync_id = self.oai_sync.id oai_logger.info( f"Identifier '{oai_identifier}' has been updated (UUID: {record.id})" ) oai_rec.last_sync_id = self.oai_sync.id oai_rec.timestamp = datestamp nusl_theses.index_draft_record(record) def transform(self, parsed, handler=None): if not handler: handler = self.transformer.transform return handler(parsed) def get_xml(self, oai_identifier): original_record = self.sickle.GetRecord( identifier=oai_identifier, metadataPrefix=self.provider.metadata_prefix) return original_record.xml def parse(self, xml_etree, parser=None): if not parser or not callable(parser): if self.parser: parser = self.parser if parser is None: raise ParserNotFoundError( "No parser specified, please check entry points and parser designation by " "decorator @Decorators.parser or specify parser as function parameter." ) return parser(xml_etree) def create_record(self, metadata): """ :return: :rtype: """ if self.create_record_handler: record = self.create_record_handler(metadata, pid_type=self.pid_type) return record else: raise HandlerNotFoundError( 'Please specify create handler during initialization. Must specify ' '"create_record" named parameter') def update_record(self, metadata): """ :return: :rtype: """ if self.update_record_handler: existing_record = nusl_theses.get_record_by_id( self.pid_type, metadata["id"]) return self.update_record_handler(existing_record, metadata) else: raise HandlerNotFoundError( 'Please specify update handler during initialization. Must specify ' '"update_record" named parameter') def delete(self, oai_identifier): """ :param oai_identifier: :type oai_identifier: :return: :rtype: """ if self.delete_record_handler: oai_record = OAIRecord.query.filter_by( oai_identifier=oai_identifier).one_or_none() if not oai_record: return record = nusl_theses.get_record_by_id(pid_type=self.pid_type, pid_value=oai_record.nusl_id) self.delete_record_handler(record) else: raise HandlerNotFoundError( 'Please specify delete handler during initialization. Must specify ' '"delete_record" named parameter') @staticmethod def ensure_migration(): # TODO: Zlepšit kontrolu zda proběhla migrace úspěšně oai_record_count = OAIRecord.query.count() records_count = RecordMetadata.query.count() if records_count > 0 and oai_record_count == 0: raise NoMigrationError( "There are records presents in database, but no OAIRecord found. Please ensure " "that you run migration script") @staticmethod def attach_id(transformed, nusl_id=None): if not nusl_id: nusl_id = str(nusl_theses.get_new_pid()) transformed["id"] = nusl_id transformed["identifier"].append({"type": "nusl", "value": nusl_id}) return transformed
class OAIRepository(HarvestRepository): """ OAI Repository """ def setRepoParams(self, repoParams): self.metadataprefix = "oai_dc" super(OAIRepository, self).setRepoParams(repoParams) self.sickle = Sickle(self.url, iterator=FRDRItemIterator) def _crawl(self): records = [] try: if self.set is None or self.set == "": records = self.sickle.ListRecords( metadataPrefix=self.metadataprefix, ignore_deleted=True) else: records = self.sickle.ListRecords( metadataPrefix=self.metadataprefix, ignore_deleted=True, set=self.set) except: self.logger.info("No items were found") kwargs = { "repo_id": self.repository_id, "repo_url": self.url, "repo_set": self.set, "repo_name": self.name, "repo_type": "oai", "enabled": self.enabled, "repo_thumbnail": self.thumbnail, "item_url_pattern": self.item_url_pattern, "abort_after_numerrors": self.abort_after_numerrors, "max_records_updated_per_run": self.max_records_updated_per_run, "update_log_after_numitems": self.update_log_after_numitems, "record_refresh_days": self.record_refresh_days, "repo_refresh_days": self.repo_refresh_days, "homepage_url": self.homepage_url } self.repository_id = self.db.update_repo(**kwargs) item_count = 0 while records: try: record = records.next() metadata = record.metadata # Search for a hyperlink in the list of identifiers if 'identifier' in metadata.keys(): if not isinstance(metadata['identifier'], list): metadata['identifier'] = [metadata['identifier']] for idt in metadata['identifier']: # TODO - what about multiple identifiers? We should have some priority here, so we always pick the same one regardless of ordering if idt.lower().startswith("http"): metadata['dc:source'] = idt if idt.lower().startswith("doi:"): metadata[ 'dc:source'] = "https://dx.doi.org/" + idt[4:] if idt.lower().startswith("hdl:"): metadata[ 'dc:source'] = "https://hdl.handle.net/" + idt[ 4:] # EPrints workaround for using header datestamp in lieu of date if 'date' not in metadata.keys() and record.header.datestamp: metadata["date"] = record.header.datestamp # Use the header id for the database key (needed later for OAI GetRecord calls) metadata['identifier'] = record.header.identifier oai_record = self.unpack_oai_metadata(metadata) domain_metadata = self.find_domain_metadata(metadata) self.db.write_record(oai_record, self.repository_id, self.metadataprefix.lower(), domain_metadata) item_count = item_count + 1 if (item_count % self.update_log_after_numitems == 0): tdelta = time.time() - self.tstart + 0.1 self.logger.info( "Done {} items after {} ({:.1f} items/sec)".format( item_count, self.formatter.humanize(tdelta), (item_count / tdelta))) except AttributeError: # probably not a valid OAI record # Islandora throws this for non-object directories self.logger.debug( "AttributeError while working on item {}".format( item_count)) pass except StopIteration: break self.logger.info("Processed {} items in feed".format(item_count)) def unpack_oai_metadata(self, record): record["pub_date"] = record.get("date") if self.metadataprefix.lower() == "ddi": # TODO: better DDI implementation that doesn't simply flatten everything, see: https://sickle.readthedocs.io/en/latest/customizing.html # Mapping as per http://www.ddialliance.org/resources/ddi-profiles/dc record["title"] = record.get("titl") record["creator"] = record.get("AuthEnty") record["subject"] = record.get("keyword", []) if "topcClas" in record.keys() and len(record["topcClas"]) > 0: record['subject'].extend(filter(None, record["topcClas"])) record["description"] = record.get("abstract") record["publisher"] = 
record.get("producer") record["contributor"] = record.get("othId") record["pub_date"] = record.get("prodDate") record["type"] = record.get("dataKind") record["identifier"] = record.get("IDNo") record["rights"] = record.get("copyright") if "northBL" in record.keys(): # This record has geoSpatial bounding lines # Convert into an array of closed bounding box points (clockwise polygon) record["geospatial"] = { "type": "Polygon", "coordinates": [[[record["northBL"][0], record["westBL"][0]], [record["northBL"][0], record["eastBL"][0]], [record["southBL"][0], record["westBL"][0]], [record["southBL"][0], record["eastBL"][0]]]] } if self.metadataprefix.lower() == "fgdc" or self.metadataprefix.lower( ) == "fgdc-std": record["creator"] = record.get("origin") record["subject"] = record.get("themekey") record["description"] = record.get("abstract") record["publisher"] = record.get("cntorg") # Put these dates in preferred order record["pub_date"] = [ record.get("pubdate"), record.get("begdate"), record.get("enddate") ] record["type"] = record.get("geoform") record["dc:source"] = record.get("onlink") record["rights"] = record.get("distliab") record["access"] = record.get("accconst") if "bounding" in record.keys(): # Sometimes point data is hacked in as a bounding box if record["westbc"] == record["eastbc"] and record[ "northbc"] == record["southbc"]: record["geospatial"] = { "type": "Point", "coordinates": [[[record["northbc"][0], record["westbc"][0]]]] } else: record["geospatial"] = { "type": "Polygon", "coordinates": [[[record["northbc"][0], record["westbc"][0]], [record["northbc"][0], record["eastbc"][0]], [record["southbc"][0], record["westbc"][0]], [record["southbc"][0], record["eastbc"][0]]]] } # Parse FRDR records if self.metadataprefix.lower() == "frdr": record["coverage"] = record.get("geolocationPlace") if "geolocationPoint" in record.keys(): point_split = re.compile(",? 
").split( record["geolocationPoint"][0]) record["geospatial"] = { "type": "Point", "coordinates": [[point_split]] } if "geolocationBox" in record.keys(): boxcoordinates = record["geolocationBox"][0].split() record["geospatial"] = { "type": "Polygon", "coordinates": [[ boxcoordinates[x:x + 2] for x in range(0, len(boxcoordinates), 2) ]] } # Look for datacite.creatorAffiliation if "creatorAffiliation" in record: record["affiliation"] = record.get("creatorAffiliation") if 'identifier' not in record.keys(): return None if record["pub_date"] is None: return None # If there are multiple identifiers, and one of them contains a link, then prefer it # Otherwise just take the first one if isinstance(record["identifier"], list): valid_id = record["identifier"][0] for idstring in record["identifier"]: if "http" in idstring.lower(): valid_id = idstring record["identifier"] = valid_id if 'creator' not in record.keys() and 'contributor' not in record.keys( ) and 'publisher' not in record.keys(): self.logger.debug( "Item {} is missing creator - will not be added".format( record["identifier"])) return None elif 'creator' not in record.keys() and 'contributor' in record.keys(): record["creator"] = record["contributor"] elif 'creator' not in record.keys() and 'publisher' in record.keys(): record["creator"] = record["publisher"] # Workaround for WOUDC, which doesn't attribute individual datasets elif self.metadataprefix.lower() == "fgdc-std": record["creator"] = self.name # If date is undefined add an empty key if 'pub_date' not in record.keys(): record["pub_date"] = "" # If there are multiple dates choose the longest one (likely the most specific) # If there are a few dates with the same length the first one will be used, which assumes we grabbed them in a preferred order # Exception test added for some strange PDC dates of [null, null] if isinstance(record["pub_date"], list): valid_date = record["pub_date"][0] or "" for datestring in record["pub_date"]: if datestring is not None: if len(datestring) > len(valid_date): valid_date = datestring record["pub_date"] = valid_date # If date is still a one-value list, make it a string if isinstance(record["pub_date"], list): record["pub_date"] = record["pub_date"][0] # Convert long dates into YYYY-MM-DD datestring = re.search("(\d{4}[-/]?\d{2}[-/]?\d{2})", record["pub_date"]) if datestring: record["pub_date"] = datestring.group(0).replace("/", "-") # If dates are entirely numeric, add separators if not re.search("\D", record["pub_date"]): if (len(record["pub_date"]) == 6): record["pub_date"] = record["pub_date"][0] + record["pub_date"][1] + record["pub_date"][2] + \ record["pub_date"][3] + "-" + record["pub_date"][4] + record["pub_date"][5] if (len(record["pub_date"]) == 8): record["pub_date"] = record["pub_date"][0] + record["pub_date"][1] + record["pub_date"][2] + \ record["pub_date"][3] + "-" + record["pub_date"][4] + record["pub_date"][5] + "-" + \ record["pub_date"][6] + record["pub_date"][7] # If a date has question marks, chuck it if "?" 
in record["pub_date"]: return None # Make sure dates are valid if not re.search( "^(1|2)\d{3}(-?(0[1-9]|1[0-2])(-?(0[1-9]|1[0-9]|2[0-9]|3[0-1]))?)?$", record["pub_date"]): self.logger.debug("Invalid date for record {}".format( record["dc:source"])) return None # record["pub_date"] = dateparser.parse(record["pub_date"]).strftime("%Y-%m-%d") if "title" not in record.keys(): return None if isinstance(record["title"], list): record["title"] = record["title"][0] if "contact" not in record.keys(): record["contact"] = "" if "publisher" in record.keys(): if isinstance(record["publisher"], list): record["publisher"] = record["publisher"][0] if record["publisher"] is not None: contact_address = re.search( r"[\w\.-]+@([\w-]+\.)+[\w-]{2,4}", record["publisher"]) try: record["contact"] = contact_address.group(0) except: pass if isinstance(record["contact"], list): record["contact"] = record["contact"][0] if "series" not in record.keys(): record["series"] = "" # DSpace workaround to exclude theses and non-data content if self.prune_non_dataset_items: if record["type"] and "Dataset" not in record["type"]: return None # EPrints workaround to fix duplicates and Nones in Rights if "rights" in record.keys() and isinstance(record["rights"], list): record["rights"] = list(set(filter(None.__ne__, record["rights"]))) # EPrints workaround for liberal use of dc:identifier # Rather not hardcode a single source URL for this if self.url == "http://spectrum.library.concordia.ca/cgi/oai2": for relation in record["relation"]: if "http://spectrum.library.concordia.ca" in relation: record["dc:source"] = relation return record def find_domain_metadata(self, record): newRecord = {} for elementName in list(record.keys()): if '#' in elementName: newRecord[elementName] = record.pop(elementName, None) return newRecord def _rate_limited(max_per_second): """ Decorator that make functions not be called faster than a set rate """ threading = __import__('threading') lock = threading.Lock() min_interval = 1.0 / float(max_per_second) def decorate(func): last_time_called = [0.0] @wraps(func) def rate_limited_function(*args, **kwargs): lock.acquire() elapsed = time.clock() - last_time_called[0] left_to_wait = min_interval - elapsed if left_to_wait > 0: time.sleep(left_to_wait) lock.release() ret = func(*args, **kwargs) last_time_called[0] = time.clock() return ret return rate_limited_function return decorate @_rate_limited(5) def _update_record(self, record): self.logger.debug("Updating OAI record {}".format( record['local_identifier'])) try: single_record = self.sickle.GetRecord( identifier=record["local_identifier"], metadataPrefix=self.metadataprefix) try: metadata = single_record.metadata if 'identifier' in metadata.keys() and isinstance( metadata['identifier'], list): if "http" in metadata['identifier'][0].lower(): metadata['dc:source'] = metadata['identifier'][0] except AttributeError: metadata = {} # EPrints workaround for using header datestamp in lieu of date if 'date' not in metadata.keys( ) and single_record.header.datestamp: metadata["date"] = single_record.header.datestamp metadata['identifier'] = single_record.header.identifier oai_record = self.unpack_oai_metadata(metadata) domain_metadata = self.find_domain_metadata(metadata) if oai_record is None: self.db.delete_record(record) return False self.db.write_record(oai_record, self.repository_id, self.metadataprefix.lower(), domain_metadata) return True except IdDoesNotExist: # Item no longer in this repo self.db.delete_record(record) return True except Exception as e: 
self.logger.error( "Updating item failed (repo_id:{}, oai_id:{}): {}".format( self.repository_id, record['local_identifier'], e)) # Touch the record so we do not keep requesting it on every run self.db.touch_record(record) self.error_count = self.error_count + 1 if self.error_count < self.abort_after_numerrors: return True return False
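The _rate_limited decorator in the example above measures elapsed time with time.clock(), which was removed in Python 3.8. A minimal equivalent sketch using time.monotonic(); the names are illustrative and this is not the repository's own code:

import threading
import time
from functools import wraps

def rate_limited(max_per_second):
    """Block so the wrapped function runs at most max_per_second times per second."""
    lock = threading.Lock()
    min_interval = 1.0 / float(max_per_second)
    last_called = [0.0]

    def decorate(func):
        @wraps(func)
        def wrapper(*args, **kwargs):
            with lock:
                # Sleep off whatever remains of the minimum interval
                elapsed = time.monotonic() - last_called[0]
                wait = min_interval - elapsed
                if wait > 0:
                    time.sleep(wait)
                result = func(*args, **kwargs)
                last_called[0] = time.monotonic()
            return result
        return wrapper
    return decorate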
class OAIHarvester(object): """Downloads files from a OAI-PMH 2.0 API and stores them as xml.""" def __init__(self, base_url: str, metadata_prefix: str, path: str, base_file_name='harvest-result', user='', password='', logger=logging.getLogger('oai'), encoding='iso-8859-1'): """ Configure a basic connection to the OAI-Server. Sets up the sickle instance with appropriate settings and checks if the metadata prefix is valid. Creates a directory at path if no such path exists. :param base_url: Base url for the oai request without http:// :param metadata_prefix: Metadata-Prefix for the api_response to be harvested. :param path: Directory path where the files should be stored. :param base_file_name: Downloads are saved in this file. If several downloads are made the resumption token or a random number is added. :param user: User name for basic http authentication (unescaped) :param password: Password for basic http authentication (unescaped) :param logger: Logger used to log all actions and errors of this class. :param encoding: The encoding used to store elements :raises InvalidPrefixError if the given prefix is not valid. """ self.encoding = encoding self.logger = logger self.use_authentication = False if user != '': assert password != '' self.user = urllib.parse.quote(user) self.encoded_password = urllib.parse.quote(password) self.use_authentication = True self.logger.info('Uses authentication with credentials: user: %s, password: %s.', self.user, self.encoded_password) else: self.logger.info('No authentication given.') self.url = base_url self.path = path self.base_file_name = base_file_name self.metadataPrefix = metadata_prefix self.api_response = None self.data = list() if self.use_authentication: self.sickle = Sickle('https://' + self.user + ':' + self.encoded_password + '@' + self.url, iterator=OAIResponseIterator) else: self.sickle = Sickle('https://' + self.url, iterator=OAIResponseIterator) self._verify_metadata_prefix() if not os.path.exists(self.path): self.logger.info('Create directory at %s.', self.path) os.makedirs(self.path) def _verify_metadata_prefix(self): """ Verifies that the used metadata prefix is valid for this OAI repository. :raises InvalidPrefixError if the given prefix is not valid. """ # changes the sickle iterator to item to easily access metadata prefix. self.sickle.iterator = OAIItemIterator valid_prefix_list = list() metadata = self.sickle.ListMetadataFormats() is_valid_prefix = False while True: try: prefix = metadata.next().metadataPrefix except StopIteration: break valid_prefix_list.append(prefix) if prefix == self.metadataPrefix: is_valid_prefix = True if not is_valid_prefix: self.logger.critical('Given metadata prefix (%s) was not valid. Select one of these: %s', self.metadataPrefix, str(valid_prefix_list)) raise InvalidPrefixError('Invalid metadataPrefix: ' + self.metadataPrefix + '.\n' + ' A list of the available prefixes: ' + str(valid_prefix_list)) else: self.logger.info('The prefix given is valid.') def store_records(self, set_id=None, date=None, ignore_deleted=False): """ Downloads all records found on the OAI-API or all records from a given set. :param set_id: determine what set to download if a given set should be downloaded (default None) :type set_id: str :param date: Only records added/changed after this date will be downloaded (default None) :type date: str 'YYYY-MM-DD' :param ignore_deleted: When true ignores all deleted records. This may not be a feature available in all OAI archives. 
:type ignore_deleted bool """ self.sickle.iterator = OAIResponseIterator params = {'metadataPrefix': self.metadataPrefix, 'from': date, 'set': set_id, 'ignore_deleted': ignore_deleted} self.api_response = self.sickle.ListRecords(**params) self._write_all_records() def store_record(self, identifier: int): """ Downloads a single record with the given id and stores it in a file at the given place. :param identifier: the id which should be retrieved. """ self.sickle.iterator = OAIResponseIterator record = self.sickle.GetRecord(identifier=identifier, metadataPrefix=self.metadataPrefix) temp_xml = record.raw with open(self.path + self.base_file_name + str(identifier) + '.xml', 'w', encoding=self.encoding) as file: file.write(temp_xml) def iterate_sets(self): """Iterate through all sets available at the OAI repository. :return List of all sets as tupels (id, name) :rtype: iterator tuple (str, str) """ self.sickle.iterator = OAIItemIterator try: sets = self.sickle.ListSets() for s in sets: yield (s.setSpec, s.setName) except NoSetHierarchy as error: self.logger.warning(str(error)) raise NoSetHierarchy(error) def _write_all_records(self): """Writes all downloaded api_response into xml files.""" if self.api_response is None: self.logger.critical('No response loaded.') raise Exception('No response loaded.') record = self.api_response.next() last_count = 0 while record: temp_xml = record.raw if isinstance(temp_xml, str): root = ElementTree.fromstring(temp_xml) self.data.append(root) download_count = len(root[2]) - 1 last_count += download_count token = root[2][-1] total = 0 file = None try: file = open(self.path + self.base_file_name + '-' + token.text + '.xml', 'w', encoding=self.encoding) total = int(root[2][-1].get('completeListSize')) self.logger.info('Downloaded %s records from repository. Still %s to go.', download_count, total - last_count) file.write(temp_xml) record = self.api_response.next() except TypeError: # no resumption token found. file = open(self.path + self.base_file_name + '-' + str(random.randrange(100000)) + '.xml', 'w', encoding=self.encoding) self.logger.info('No resumption token found. Stopping Download. ' 'Downloaded %s from this repository.', total) file.write(temp_xml) record = None except (BadArgument, BadResumptionToken) as error: self.logger.critical('Stopped Download: "%s"', str(error)) record = None finally: if file is not None: file.close()
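A short usage sketch for the harvester above. The host, prefix, output path, and identifier are placeholders; base_url is given without a scheme because the constructor prepends https:// itself, and path needs a trailing slash because file names are appended directly to it:

# Hypothetical values, for illustration only
harvester = OAIHarvester(base_url='example.org/oai/request',
                         metadata_prefix='oai_dc',
                         path='./harvest/')

# Dump every record added or changed since the given date into xml files
harvester.store_records(date='2020-01-01')

# Or fetch one record into its own file
harvester.store_record('oai:example.org:12345')

# List available sets as (setSpec, setName) tuples
for spec, name in harvester.iterate_sets():
    print(spec, name)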
from sickle import Sickle
from sickle.oaiexceptions import IdDoesNotExist

from constants import oai_url, oai_id
from downloader import download

sickle = Sickle(oai_url)
counter = 0
skip = 0

while True:
    counter += 1
    oaid = oai_id + str(counter)
    try:
        record = sickle.GetRecord(identifier=oaid, metadataPrefix='oai_dc')
        if download(str(counter), str(record)):
            skip = 0
        else:
            skip += 1
    except IdDoesNotExist:
        skip += 1
    if skip > 500:
        break

print("Finished")
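The loop above guesses identifiers by appending an increasing counter to a base id and gives up after 500 consecutive misses. When the repository supports it, the same records can be enumerated without guessing; a minimal sketch using ListIdentifiers, with a placeholder endpoint and prefix:

from sickle import Sickle

sickle = Sickle("https://example.org/oai/request")  # placeholder endpoint

# Walk every identifier the repository advertises, skipping deleted items
for header in sickle.ListIdentifiers(metadataPrefix="oai_dc", ignore_deleted=True):
    record = sickle.GetRecord(identifier=header.identifier, metadataPrefix="oai_dc")
    print(header.identifier, len(record.raw))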
def do_import(max_num): x = 0 bit = 100 / max_num print(' [*] connecting to database...') db = database.get_db() print(' [*] connecting to the OAI-PMH server...') sickle = Sickle('https://web.e.toscana.it/SebinaOpac/OAIHandler') print(' [*] fetching records with prefix `oai_dc` (Dublin Core)...') records = sickle.ListRecords(metadataPrefix='oai_dc') count = 0 array = [] places = [] db.execute('delete from biblios') db.execute('delete from records') db.execute('delete from places') db.execute("delete from sqlite_sequence where name='records'") query = "INSERT INTO records(title, subject, creator, contributor, date, description, language, publisher, type, format, relation, published_in, link, biblio)" \ "VALUES(?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?, ?)" query2 = "INSERT INTO places(id, name, coords) VALUES(?, ?, ?)" print(' [*] parsing first {} records...'.format(max_num)) for record in records: if count < max_num: # Estrai luogo di pubblicazione dallo stesso record ma in formato Unimarc unimarc = sickle.GetRecord(identifier=record.header.identifier, metadataPrefix='oai_unimarc') luogo = luogo_pubblicazione(unimarc) place_id = None if luogo is not None: coords = get_coordinates(luogo) # Hardcoded coordinates if luogo == 'Accademia dei Georgofili': coords = '43.7685119,11.255005' place_id = '6569185' elif luogo == 'Massa': coords = '44.033333,10.133333' place_id = '875754' elif luogo == 'Castel S. Niccolò': coords = '43.7192741,11.5975257' place_id = '31541' elif luogo == 'Porcari': coords = '43.8419546,10.6008321' place_id = '32388' elif luogo == 'Cascina': coords = '43.6877668,10.4729074' place_id = '34342' elif luogo == 'San Vincenzo': coords = '43.100134,10.540344' place_id = '32495' elif luogo == 'Monte Oriolo, Impruneta': coords = '43.70869,11.25515' place_id = '18487140' if coords != '': if place_id is None: place_id = get_page_id(luogo) if place_id is not None: if not any(x[0] == place_id for x in places): places.append((str(place_id), luogo, coords)) # Increment counters count += 1 x += bit # Inserisci nella query d = record2dict(record, place_id) array.append( (d['title'].strip(), d['subject'], d['creator'], d['contributor'], d['date'], d['description'], d['language'], d['publisher'], d['type'], d['format'], d['relation'], d['published_in'], d['link'], d['biblio'])) else: print('could not find page id for ' + luogo + ', skipping...') else: print('could not find coordinates for ' + luogo + ', skipping...') desc = "Importing records... ({}/{})".format(count, max_num) yield "data: {}%%{}\n\n".format(str(x), desc) else: print(' [*] closing source...') yield "data: {}%%{}\n\n".format('100', 'done') break print(' [*] inserting saved records to the table...') # Inserisci nel db db.executemany(query, array) db.executemany(query2, places) db.commit() print(' [*] done!')