def testUpdateRecordWhileSendingData(self):
    """A ListIdentifiers response must still end with a resumptionToken
    when a record is re-added while the response is being streamed.

    The repository is filled with more records than fit in one batch; as
    soon as the chunk mentioning ``id0`` has been written, ``id1`` is added
    again. The body (everything after the last blank line) is then parsed
    and checked for a resumptionToken.
    """
    pageSize = 3
    jazz = OaiJazz(join(self.tempdir, 'oai'))
    jazz.updateMetadataFormat(prefix="prefix", schema="", namespace="")
    storage = MultiSequentialStorage(join(self.tempdir, 'storage'))
    self._addOaiRecords(storage, jazz, count=pageSize + 10)
    dna = be(
        (Observable(),
            (OaiPmh(repositoryName='test', adminEmail='*****@*****.**', batchSize=pageSize),
                (storage,),
                (jazz,),
            )
        )
    )
    request = {
        'Method': 'GET',
        'Headers': {'Host': 'myserver'},
        'port': 1234,
        'path': '/oaipmh.pl',
        'arguments': {
            'verb': ['ListIdentifiers'],
            'metadataPrefix': ['prefix'],
        },
    }
    chunks = compose(dna.all.handleRequest(**request))
    received = StringIO()
    for chunk in chunks:
        received.write(chunk)
        # Touch a record that still has to be sent, in the middle of the response.
        if 'identifier>id0<' in chunk:
            jazz.addOaiRecord(identifier="id1", metadataPrefixes=["prefix"])
    body = received.getvalue().split(CRLF * 2)[-1]
    result = XML(body.encode())
    resumptionToken = xpathFirst(
        result,
        '/oai:OAI-PMH/oai:ListIdentifiers/oai:resumptionToken/text()')
    self.assertFalse(resumptionToken is None)
def testNearRealtimeOai(self):
    """Run an OAI-PMH server and a harvester in two threads and check that
    a record added later reaches the observer without a restart.

    The suspend register is inspected along the way: one suspended client
    while nothing is pending, none right after a new record was added.
    The sleeps give the two threads time to exchange requests, so the
    statement order below matters.
    """
    self.run = True
    port = randint(50000, 60000)
    register = SuspendRegister()
    jazz = OaiJazz(join(self.tempdir, 'oai'))
    jazz.updateMetadataFormat(prefix="prefix", schema="", namespace="")
    jazz.addObserver(register)
    storage = MultiSequentialStorage(join(self.tempdir, 'storage'))
    self._addOaiRecords(storage, jazz, 3)
    serverThread = Thread(
        target=lambda: self.startOaiPmh(port, jazz, storage, register))
    observer = CallTrace(
        "observer",
        ignoredAttributes=["observer_init"],
        methods={'add': lambda **kwargs: (x for x in [])})
    clientThread = Thread(
        target=lambda: self.startOaiHarvester(port, observer))

    def calledNames():
        return [method.name for method in observer.calledMethods]

    serverThread.start()
    clientThread.start()
    try:
        requests = 3
        sleepWheel(1.0 + 1.0 * requests)
        expectedNames = [
            'startOaiBatch', 'add', 'add', 'stopOaiBatch',
            'startOaiBatch', 'add', 'stopOaiBatch',
        ]
        self.assertEqual(expectedNames, calledNames())
        harvestedIds = []
        for method in observer.calledMethods:
            if method.name != 'add':
                continue
            harvestedIds.append(
                xpath(method.kwargs['lxmlNode'], '//oai:header/oai:identifier/text()'))
        self.assertEqual([['id0'], ['id1'], ['id2']], harvestedIds)
        self.assertEqual(1, len(register))

        observer.calledMethods.reset()
        requests += 1
        storage.addData(identifier="id3", name="prefix", data=b"<a>a3</a>")
        jazz.addOaiRecord(identifier="id3", metadataPrefixes=["prefix"])
        sleepWheel(1)
        self.assertEqual(0, len(register))
        self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'], calledNames())
        addedXml = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
        self.assertTrue("id3" in addedXml, addedXml)
        sleepWheel(1.0)
        self.assertEqual(1, len(register))
    finally:
        # Signal both loops to stop before waiting for the threads.
        self.run = False
        serverThread.join()
        clientThread.join()
        jazz.close()
def testNearRealtimeOaiSavesState(self):
    """Restarting server and harvester must not re-deliver old records.

    A first server/harvester run delivers ``id0``. Both threads are then
    stopped, ``id1`` is added, and a second run is started against the same
    observer; that run must deliver ``id1`` only.
    """
    observer = CallTrace("observer",
                         ignoredAttributes=["observer_init"],
                         methods={'add': lambda **kwargs: (x for x in [])})
    oaiJazz = OaiJazz(join(self.tempdir, 'oai'))
    oaiJazz.updateMetadataFormat(prefix="prefix", schema="", namespace="")
    suspendRegister = SuspendRegister()
    oaiJazz.addObserver(suspendRegister)
    storageComponent = MultiSequentialStorage(join(self.tempdir, 'storage'))
    self._addOaiRecords(storageComponent, oaiJazz, 1)

    oaiPmhThread = None
    harvestThread = None

    def start():
        # nonlocal (not global): the thread handles belong to this test
        # invocation and must not leak into the module namespace.
        nonlocal oaiPmhThread, harvestThread
        self.run = True
        portNumber = randint(50000, 60000)
        oaiPmhThread = Thread(
            None,
            lambda: self.startOaiPmh(
                portNumber, oaiJazz, storageComponent, suspendRegister))
        harvestThread = Thread(
            None, lambda: self.startOaiHarvester(portNumber, observer))
        oaiPmhThread.start()
        harvestThread.start()

    def stop():
        nonlocal oaiPmhThread, harvestThread
        self.run = False
        for thread in (oaiPmhThread, harvestThread):
            # Only join threads that exist and were actually started, so
            # stop() is safe to call twice and from the finally clause.
            if thread is not None and thread.ident is not None:
                thread.join()
        oaiPmhThread = None
        harvestThread = None

    try:
        start()
        requests = 1
        sleepWheel(1.0 + 1.0 * requests)

        self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'],
                         [m.name for m in observer.calledMethods])
        kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
        self.assertTrue("id0" in kwarg, kwarg)
        stop()

        observer.calledMethods.reset()
        storageComponent.addData(identifier="id1", name="prefix", data=b"<a>a1</a>")
        oaiJazz.addOaiRecord(identifier="id1", metadataPrefixes=["prefix"])

        start()
        requests = 1
        sleepWheel(1.0 + 1.0 * requests)

        self.assertEqual(['startOaiBatch', 'add', 'stopOaiBatch'],
                         [m.name for m in observer.calledMethods])
        kwarg = lxmltostring(observer.calledMethods[1].kwargs['lxmlNode'])
        self.assertFalse("id0" in kwarg, kwarg)
        self.assertTrue("id1" in kwarg, kwarg)
    finally:
        # Always stop the worker threads, also when an assertion failed.
        stop()
def main(reactor, port, directory):
    """Set up the OAI test server below *directory* and fill it with records.

    Builds the HTTP component tree (dump, control, oai, badoai, log and
    ready paths), initialises the observers, registers the ``oai_dc``
    format and stores fifteen records, of which numbers 3 and 6 are
    deleted again.
    """
    dumpdir = join(directory, 'dump')
    if not isdir(dumpdir):
        makedirs(dumpdir)
    dump = Dump(dumpdir)
    oaiStorage = MultiSequentialStorage(join(directory, 'storage'))
    oaiJazz = OaiJazz(join(directory, 'oai'))
    server = be(
        (Observable(),
            (ObservableHttpServer(reactor, port),
                (PathFilter("/dump"),
                    (dump,)
                ),
                (PathFilter("/control"),
                    (Control(),
                        (dump,),
                        (Log(),),
                    )
                ),
                (PathFilter('/oai'),
                    (Log(),
                        (OaiPmh(repositoryName="Oai Test Server", adminEmail="*****@*****.**", batchSize=10),
                            (oaiStorage,),
                            (oaiJazz,),
                        )
                    )
                ),
                (PathFilter('/badoai'),
                    (Log(),
                        (BadOai(),)
                    )
                ),
                (PathFilter("/log"),
                    (RetrieveLog(),
                        (Log(),)
                    )
                ),
                (PathFilter("/ready"),
                    (StringServer('yes', ContentTypePlainText),)
                )
            )
        )
    )
    list(compose(server.once.observer_init()))
    oaiJazz.updateMetadataFormat(
        prefix="oai_dc",
        schema="http://www.openarchives.org/OAI/2.0/oai_dc.xsd",
        namespace="http://www.openarchives.org/OAI/2.0/oai_dc/")

    recordTemplate = '''<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"><dc:identifier>%s</dc:identifier><dc:title>Title is √</dc:title></oai_dc:dc>'''
    deletedNumbers = [3, 6]
    for number in range(1, 16):
        # Record 2 gets an identifier with characters that need XML escaping.
        identifier = 'oai:record:02/&gkn' if number == 2 else 'oai:record:%02d' % number
        xml = recordTemplate % escapeXml(identifier)
        oaiStorage.addData(identifier=identifier, name='oai_dc', data=bytes(xml, encoding='utf-8'))
        oaiJazz.addOaiRecord(identifier=identifier, metadataPrefixes=['oai_dc'])
        if number in deletedNumbers:
            list(compose(oaiJazz.delete(identifier=identifier)))
def testGetRecordWithMultiSequentialStorage(self):
    """GetRecord returns the data stored in a MultiSequentialStorage.

    One record (``id0``) is registered in OaiJazz and its ``oai_dc`` part
    stored; the ``oai:metadata`` element of the response must contain
    exactly that data.
    """
    jazz = OaiJazz(self.tempdir + '/jazz')
    jazz.updateMetadataFormat(prefix="oai_dc", schema="", namespace="")
    sequentialStore = MultiSequentialStorage(self.tempdir + "/seq-store")
    recordWriter = OaiRecord()
    getRecordTree = be(
        (OaiGetRecord(repository=OaiRepository()),
            (jazz,),
            (recordWriter,
                (RetrieveToGetDataAdapter(),
                    (sequentialStore,)
                )
            )
        )
    )
    jazz.addOaiRecord(identifier="id0", metadataPrefixes=['oai_dc'])
    sequentialStore.addData(identifier="id0", name="oai_dc", data=b"data01")

    arguments = {
        'verb': ['GetRecord'],
        'metadataPrefix': ['oai_dc'],
        'identifier': ['id0'],
    }
    response = getRecordTree.getRecord(arguments=arguments, **self.httpkwargs)
    # Exactly one header/body separator is expected in the response.
    _, body = asString(response).split("\r\n\r\n")
    metadataElements = xpath(parse(BytesIO(body.encode())), '//oai:metadata')
    self.assertEqual("data01", metadataElements[0].text)
def testRemoveSet(self):
    """removeSetsFromOai strips one set from every record and from the
    list of known sets, leaving the records themselves undeleted."""

    def summarize(jazz):
        # (identifier, sets, isDeleted) for every record under 'prefix'.
        selection = jazz.oaiSelect(prefix='prefix')
        return [(record.identifier, record.sets, record.isDeleted)
                for record in selection.records]

    oaiJazz = OaiJazz(self.tempdir)
    oaiJazz.updateSet('a:b', 'set A/B')
    oaiJazz.updateSet('a:c', 'set A/C')
    oaiJazz.updateMetadataFormat(prefix="prefix", schema="", namespace="")
    oaiJazz.addOaiRecord('id:0', setSpecs=['a:b', 'a:c'], metadataPrefixes=['prefix'])
    oaiJazz.addOaiRecord('id:1', setSpecs=['a:b'], metadataPrefixes=['prefix'])
    oaiJazz.addOaiRecord('id:2', setSpecs=['a:c'], metadataPrefixes=['prefix'])
    expectedBefore = [
        ('id:0', {'a', 'a:b', 'a:c'}, False),
        ('id:1', {'a', 'a:b'}, False),
        ('id:2', {'a', 'a:c'}, False),
    ]
    self.assertEqual(expectedBefore, summarize(oaiJazz))
    self.assertEqual({'a:b', 'a', 'a:c'}, oaiJazz.getAllSets())
    oaiJazz.close()

    removeSetsFromOai(self.tempdir, sets=['a:b'], prefix='prefix', batchSize=1)

    oaiJazz = OaiJazz(self.tempdir)
    expectedAfter = [
        ('id:2', {'a', 'a:c'}, False),
        ('id:0', {'a', 'a:c'}, False),
        ('id:1', set(), False),
    ]
    self.assertEqual(expectedAfter, summarize(oaiJazz))
    self.assertEqual({'a', 'a:c'}, oaiJazz.getAllSets())
def testShouldRaiseExceptionOnSameRequestTwice(self):
    """A second x-wait request from the same client aborts the first one.

    Two harvest threads send the same ListRecords request with an identical
    client identifier. Once the second request is suspended, the first one
    must have been answered with status 204 and an 'Aborting suspended
    request' body, while the register holds a new suspend object for the
    client.
    """
    self.run = True
    portNumber = randint(50000, 60000)
    oaiJazz = OaiJazz(join(self.tempdir, 'oai'))
    oaiJazz.updateMetadataFormat(prefix="prefix", schema="", namespace="")
    suspendRegister = SuspendRegister()
    oaiJazz.addObserver(suspendRegister)
    storageComponent = MultiSequentialStorage(join(self.tempdir, 'storage'))
    clientId = str(uuid4())

    responses = []

    def doOaiListRecord(port):
        # Use the port that was passed in rather than the enclosing variable.
        header, body = getRequest(
            port=port,
            path="/",
            arguments={
                "verb": "ListRecords",
                "metadataPrefix": "prefix",
                "x-wait": "True"
            },
            additionalHeaders={'X-Meresco-Oai-Client-Identifier': clientId},
            parse=False)
        responses.append((header, body))

    oaiPmhThread = Thread(
        None,
        lambda: self.startOaiPmh(portNumber, oaiJazz, storageComponent, suspendRegister))
    harvestThread1 = Thread(None, lambda: doOaiListRecord(portNumber))
    harvestThread2 = Thread(None, lambda: doOaiListRecord(portNumber))

    with stderr_replaced():
        oaiPmhThread.start()
        harvestThread1.start()

        try:
            # Wait until the first request is suspended.
            while len(suspendRegister) == 0:
                sleep(0.01)
            harvest1Suspend = suspendRegister._suspendObject(clientId)
            self.assertIsNotNone(harvest1Suspend)
            harvestThread2.start()
            # Wait until the second request has replaced the first suspend.
            while harvest1Suspend == suspendRegister._suspendObject(clientId):
                sleep(0.01)
            sleep(0.01)
            self.assertIn(clientId, suspendRegister)
            self.assertNotEqual(harvest1Suspend, suspendRegister._suspendObject(clientId))

            self.assertEqual(1, len(responses))
            statusAndHeader, body = responses[0]
            self.assertEqual("204", statusAndHeader['StatusCode'])
            self.assertTrue(body.startswith(b'Aborting suspended request'), body)

            # Adding a record releases the request that is still suspended.
            storageComponent.addData(identifier="id1", name="prefix", data=b"<a>a1</a>")
            oaiJazz.addOaiRecord(identifier="id1", metadataPrefixes=["prefix"])
            sleep(0.1)
        finally:
            self.run = False
            oaiPmhThread.join()
            harvestThread1.join()
            # join() on a never-started thread raises RuntimeError and would
            # hide the assertion that brought us here.
            if harvestThread2.ident is not None:
                harvestThread2.join()
            oaiJazz.close()
def main(reactor, port, statePath, gatewayPort, quickCommit=False, **ignored):
    """Return the component tree of the GMH api server.

    The tree combines a periodic OAI download from the gateway on
    *gatewayPort* with an HTTP server on *port* that answers ``/oai``,
    ``/rss`` and ``/xls``. State is kept below *statePath*. With
    *quickCommit* the gateway is polled every 0.1 instead of every 10.
    Extra keyword arguments are accepted and ignored.
    """
    hashStrategy = Md5HashDistributeStrategy()
    removableParts = [NL_DIDL_NORMALISED_PREFIX, NL_DIDL_COMBINED_PREFIX, 'metadata']
    storage = StorageComponent(
        join(statePath, 'store'),
        strategy=hashStrategy,
        partsRemovedOnDelete=removableParts)

    oaiJazz = OaiJazz(join(statePath, 'oai'))
    oaiJazz.updateMetadataFormat(prefix="metadata", schema="http://didl.loc.nl/didl.xsd", namespace=NAMESPACEMAP.didl)
    oaiJazz.updateMetadataFormat(prefix=NL_DIDL_COMBINED_PREFIX, schema="", namespace=NAMESPACEMAP.gmhcombined)
    oaiJazz.updateMetadataFormat(prefix=NL_DIDL_NORMALISED_PREFIX, schema="", namespace=NAMESPACEMAP.gmhnorm)

    normLogger = Logger(join(statePath, '..', 'gateway', 'normlogger'))

    # WST: Interval in seconds before sending a new request to the GATEWAY in
    # case of an error while processing batch records.(default=1).
    # IntegrationTests need <=1 second! Otherwise tests will fail!
    pollPeriod = .1 if quickCommit else 10
    gatewayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(period=pollPeriod),
        name='api',
        autoStart=True)

    harvestState = join(statePath, 'harvesterstate', 'gateway')
    oaiDownload = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=harvestState,
        userAgentAddition='ApiServer',
        xWait=True,
        name='api',
        autoCommit=False)

    # The components below are created in the same order as they appear in
    # the tree that is returned.
    root = Observable()
    downloadHelix = createDownloadHelix(reactor, gatewayDownload, oaiDownload, storage, oaiJazz)
    httpServer = ObservableHttpServer(reactor, port, compressResponse=True)
    httpHandler = BasicHttpHandler()

    oaiBranch = (
        PathFilter('/oai'),
        (OaiPmh(
                repositoryName="Gemeenschappelijke Metadata Harvester DANS-KB",
                adminEmail="*****@*****.**",
                externalUrl="http://oai.gharvester.dans.knaw.nl",
                batchSize=200,
                supportXWait=False,
                # preciseDatestamp=False,
                # deleteInSets=False
            ),
            (oaiJazz,),
            (RetrieveToGetDataAdapter(),
                (storage,),
            ),
            (OaiBranding(
                    url="https://www.narcis.nl/images/logos/logo-knaw-house.gif",  # TODO: Link to a joint-GMH icon...
                    link="https://harvester.dans.knaw.nl",
                    title="Gemeenschappelijke Metadata Harvester (GMH) van DANS en de KB"),
            ),
            (OaiProvenance(
                    nsMap=NAMESPACEMAP,
                    baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                    harvestDate=('meta', '//meta:harvestdate/text()'),
                    # TODO: could possibly be hard-coded in the harvester mapper:
                    # <metadataNamespace>urn:mpeg:mpeg21:2002:01-DII-NS</metadataNamespace>?? (storage,)
                    metadataNamespace=('meta', '//meta:metadataPrefix/text()'),
                    # metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                    identifier=('header', '//oai:identifier/text()'),
                    datestamp=('header', '//oai:datestamp/text()')
                ),
                (RetrieveToGetDataAdapter(),
                    (storage,),
                )
            )
        )
    )

    rssBranch = (
        PathFilter('/rss'),
        (LoggerRSS(
                title='GMH DANS-KB Normalisationlog Syndication',
                description='Harvester normalisation log for: ',
                link='http://rss.gharvester.dans.knaw.nl/rss',
                maximumRecords=30),
            (normLogger,
                (storage,)
            )
        )
    )

    xlsBranch = (
        PathFilter('/xls'),
        # (LogComponent("XLS-Request:"),),
        (XlsServer(),)
    )

    return (
        root,
        downloadHelix,
        (httpServer,
            (httpHandler,
                oaiBranch,
                rssBranch,
                xlsBranch,
            )
        )
    )
def main(reactor, port, statePath, lucenePort, gatewayPort, quickCommit=False, **ignored):
    """Return the component tree of the NARCIS api server including CERIF.

    A read-only Lucene query helix (on *lucenePort*) is combined with a
    periodic OAI download from the gateway (on *gatewayPort*) and an HTTP
    server on *port* that answers ``/oai``, ``/cerif``, ``/sru`` and
    ``/rss``. State is kept below *statePath*. With *quickCommit* the
    gateway is polled every 1 instead of every 10. Extra keyword arguments
    are accepted and ignored.
    """
    ######## START Lucene Integration ###############################################################
    defaultLuceneSettings = LuceneSettings(commitTimeout=30, readonly=True)

    http11Request = be(
        (HttpRequest1_1(),
            (SocketPool(
                reactor=reactor,
                unusedTimeout=5,
                limits={'totalSize': 100, 'destinationSize': 10}),
            ),
        )
    )

    readonlySettings = defaultLuceneSettings.clone(readonly=True)
    luceneIndex = luceneAndReaderConfig(readonlySettings, http11Request, lucenePort)

    luceneRoHelix = be(
        (AdapterToLuceneQuery(
                defaultCore=DEFAULT_CORE,
                coreConverters={
                    DEFAULT_CORE: QueryExpressionToLuceneQueryDict(
                        UNQUALIFIED_TERM_FIELDS,
                        luceneSettings=luceneIndex.settings),
                }
            ),
            (MultiLucene(host='localhost', port=lucenePort, defaultCore=DEFAULT_CORE),
                (luceneIndex,),
                (http11Request,),
            )
        )
    )
    ######## END Lucene Integration ###############################################################

    fieldnameRewrites = {
        # UNTOKENIZED_PREFIX+'genre': UNTOKENIZED_PREFIX+'dc:genre',
    }

    def fieldnameRewrite(name):
        # Unknown names map onto themselves.
        return fieldnameRewrites.get(name, name)

    def drilldownFieldnamesTranslate(fieldname):
        # Prefer the untokenized variant when one is registered.
        candidate = untokenizedFieldname(fieldname)
        chosen = candidate if candidate in untokenizedFieldnames else fieldname
        return fieldnameRewrite(chosen)

    convertToComposedQuery = ConvertToComposedQuery(
        resultsFrom=DEFAULT_CORE,
        matches=[],
        drilldownFieldnamesTranslate=drilldownFieldnamesTranslate)

    hashStrategy = Md5HashDistributeStrategy()
    removableParts = [
        HEADER_PARTNAME,
        META_PARTNAME,
        METADATA_PARTNAME,
        OAI_DC_PARTNAME,
        LONG_PARTNAME,
        SHORT_PARTNAME,
        OPENAIRE_PARTNAME,
    ]
    storage = StorageComponent(
        join(statePath, 'store'),
        strategy=hashStrategy,
        partsRemovedOnDelete=removableParts)

    oaiJazz = OaiJazz(join(statePath, 'oai'))
    oaiJazz.updateMetadataFormat(
        OAI_DC_PARTNAME,
        "http://www.openarchives.org/OAI/2.0/oai_dc.xsd",
        "http://purl.org/dc/elements/1.1/")

    oai_oa_cerifJazz = OaiJazz(join(statePath, 'oai_cerif'))
    oai_oa_cerifJazz.updateMetadataFormat(
        OPENAIRE_PARTNAME,
        "https://www.openaire.eu/schema/cris/current/openaire-cerif-profile.xsd",
        "https://www.openaire.eu/cerif-profile/1.1/")
    # All of the following OAI-PMH sets shall be recognized by the CRIS, even if not all of them are populated.
    cerifSets = (
        ("openaire_cris_projects", "OpenAIRE_CRIS_projects"),
        ("openaire_cris_orgunits", "OpenAIRE_CRIS_orgunits"),
        ("openaire_cris_persons", "OpenAIRE_CRIS_persons"),
        ("openaire_cris_patents", "OpenAIRE_CRIS_patents"),
        ("openaire_cris_products", "OpenAIRE_CRIS_products"),
        ("openaire_cris_publications", "OpenAIRE_CRIS_publications"),
        ("openaire_cris_funding", "OpenAIRE_CRIS_funding"),
        ("openaire_cris_events", "OpenAIRE_CRIS_events"),
        ("openaire_cris_equipments", "OpenAIRE_CRIS_equipments"),
    )
    for cerifSpec, cerifName in cerifSets:
        oai_oa_cerifJazz.updateSet(cerifSpec, cerifName)

    cqlClauseConverters = [
        RenameFieldForExact(
            untokenizedFields=untokenizedFieldnames,
            untokenizedPrefix=UNTOKENIZED_PREFIX,
        ).filterAndModifier(),
        SearchTermFilterAndModifier(
            shouldModifyFieldValue=lambda *args: True,
            fieldnameModifier=fieldnameRewrite,
        ).filterAndModifier(),
    ]

    # WST: Interval in seconds before sending a new request to the GATEWAY in
    # case of an error while processing batch records.(default=1).
    # IntegrationTests need 1 second! Otherwise tests will fail!
    pollPeriod = 1 if quickCommit else 10
    gatewayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(period=pollPeriod),
        name='api',
        autoStart=True)

    harvestState = join(statePath, 'harvesterstate', 'gateway')
    oaiDownload = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=harvestState,
        userAgentAddition='ApiServer',
        xWait=True,
        name='api',
        autoCommit=False)

    executeQueryHelix = (
        FilterMessages(allowed=['executeQuery']),
        (CqlMultiSearchClauseConversion(cqlClauseConverters, fromKwarg='query'),
            (DrilldownQueries(),
                (convertToComposedQuery,
                    (luceneRoHelix,),
                )
            )
        )
    )

    # The components below are created in the same order as they appear in
    # the tree that is returned.
    root = Observable()
    downloadHelix = createDownloadHelix(
        reactor, gatewayDownload, oaiDownload, storage, oaiJazz, oai_oa_cerifJazz)
    httpServer = ObservableHttpServer(reactor, port, compressResponse=True)
    httpHandler = BasicHttpHandler()

    oaiBranch = (
        PathFilter(["/oai"]),
        (OaiPmh(repositoryName="NARCIS OAI-pmh", adminEmail="*****@*****.**", externalUrl="http://oai.narcis.nl"),
            (oaiJazz,),
            (StorageAdapter(),
                (storage,)
            ),
            (OaiBranding(
                    url="http://www.narcis.nl/images/logos/logo-knaw-house.gif",
                    link="http://oai.narcis.nl",
                    title="Narcis - The gateway to scholarly information in The Netherlands"),
            ),
            (OaiProvenance(
                    nsMap=NAMESPACEMAP,
                    baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                    harvestDate=('meta', '//meta:record/meta:harvestdate/text()'),
                    metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                    identifier=('header', '//oai:identifier/text()'),
                    datestamp=('header', '//oai:datestamp/text()')
                ),
                (storage,)
            )
        )
    )

    cerifBranch = (
        PathFilter(["/cerif"]),
        (OaiPmhDans(
                repositoryName="OpenAIRE CERIF",
                adminEmail="*****@*****.**",
                repositoryIdentifier="services.nod.dans.knaw.nl",
                externalUrl="http://services.nod.dans.knaw.nl"),  # TODO: pathFilter should resemble proxy path
            (oai_oa_cerifJazz,),
            (StorageAdapter(),
                (storage,)
            ),
            (OaiOpenAIREDescription(
                    serviceid='organisation:ORG1242054',
                    acronym='services.nod.dans.knaw.nl',
                    name='NARCIS',
                    description='Compliant with the OpenAIRE Guidelines for CRIS Managers v.1.1.',
                    website='https://www.narcis.nl',
                    baseurl='http://services.nod.dans.knaw.nl/oa-cerif',
                    subjectheading='',
                    orgunitid='organisation:ORG1242054',
                    owneracronym='DANS'),
            ),
            # (OaiBranding(
            #     url="http://www.narcis.nl/images/logos/logo-knaw-house.gif",
            #     link="http://oai.narcis.nl",
            #     title="Narcis - The gateway to scholarly information in The Netherlands"),
            # ),
            (OaiProvenance(
                    nsMap=NAMESPACEMAP,
                    baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                    harvestDate=('meta', '//meta:record/meta:harvestdate/text()'),
                    metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                    identifier=('header', '//oai:identifier/text()'),
                    datestamp=('header', '//oai:datestamp/text()')
                ),
                (storage,)
            )
        )
    )

    sruBranch = (
        PathFilter(['/sru']),
        (SruParser(
                host='sru.narcis.nl',
                port=80,
                defaultRecordSchema='knaw_short',
                defaultRecordPacking='xml'),
            (SruLimitStartRecord(limitBeyond=4000),
                (SruHandler(
                        includeQueryTimes=False,
                        extraXParameters=[],
                        enableCollectLog=False),
                    (SruTermDrilldown(),),
                    executeQueryHelix,
                    (StorageAdapter(),
                        (storage,)
                    )
                )
            )
        )
    )

    rssBranch = (
        PathFilter('/rss'),
        (Rss(
                supportedLanguages=['nl', 'en'],  # defaults to first, if requested language is not available or supplied.
                title={'nl': 'NARCIS', 'en': 'NARCIS'},
                description={
                    'nl': 'NARCIS: De toegang tot de Nederlandse wetenschapsinformatie',
                    'en': 'NARCIS: The gateway to Dutch scientific information',
                },
                link={
                    'nl': 'http://www.narcis.nl/?Language=nl',
                    'en': 'http://www.narcis.nl/?Language=en',
                },
                maximumRecords=20),
            executeQueryHelix,
            (RssItem(
                    nsMap=NAMESPACEMAP,
                    title=('knaw_short', {
                        'nl': '//short:metadata/short:titleInfo[not (@xml:lang)]/short:title/text()',
                        'en': '//short:metadata/short:titleInfo[@xml:lang="en"]/short:title/text()',
                    }),
                    description=('knaw_short', {
                        'nl': '//short:abstract[not (@xml:lang)]/text()',
                        'en': '//short:abstract[@xml:lang="en"]/text()',
                    }),
                    pubdate=('knaw_short', '//short:dateIssued/short:parsed/text()'),
                    linkTemplate='http://www.narcis.nl/%(wcpcollection)s/RecordID/%(oai_identifier)s/Language/%(language)s',
                    wcpcollection=('meta', '//*[local-name() = "collection"]/text()'),
                    oai_identifier=('meta', '//meta:record/meta:id/text()'),
                    language=('Dummy: Language is auto provided by the calling RSS component, but needs to be present to serve the linkTemplate.')
                ),
                (StorageAdapter(),
                    (storage,)
                )
            )
        )
    )

    return (
        root,
        downloadHelix,
        (httpServer,
            (httpHandler,
                oaiBranch,
                cerifBranch,
                sruBranch,
                rssBranch,
            )
        )
    )
def testExport(self):
    """OaiJazz.export writes a META section and one line per record.

    3000 records are added, after which one prefix, one set and one whole
    record are deleted. The dump must then hold 3003 lines: two header
    lines, the JSON meta line and the records, with the three modified
    records at the end.
    """
    jazz = OaiJazz(join(self.tempdir, 'oai'), deleteInSets=True)
    jazz.updateMetadataFormat(prefix='someprefix', schema='https://example.org/schema.xsd', namespace='urn:ns')
    jazz.updateMetadataFormat(prefix='prefix', schema='schema', namespace='namespace')
    jazz.updateSet(setSpec='a', setName='A')
    jazz.updateSet(setSpec='setSpec', setName='setName')

    nestedSets = ['a', 'a:b', 'd:e:f']
    jazz.addOaiRecord(identifier='id:0', metadataPrefixes=['prefix'])
    jazz.addOaiRecord(identifier='id:1', metadataPrefixes=['prefix'], setSpecs=list(nestedSets))
    jazz.addOaiRecord(identifier='id:2', metadataPrefixes=['prefix', 'someprefix'], setSpecs=list(nestedSets))
    jazz.addOaiRecord(identifier='id:3', metadataPrefixes=['prefix', 'someprefix'], setSpecs=list(nestedSets))
    for number in range(4, 3000):
        jazz.addOaiRecord(identifier='id:{}'.format(number), metadataPrefixes=['prefix'])
    jazz.deleteOaiRecordInPrefixes(identifier='id:2', metadataPrefixes=['someprefix'])
    jazz.deleteOaiRecordInSets(identifier='id:3', setSpecs=['d:e:f'])
    jazz.deleteOaiRecord(identifier='id:7')

    dumpfile = join(self.tempdir, 'dump')
    jazz.export(dumpfile)

    with open(dumpfile) as dumped:
        lines = dumped.readlines()

    self.assertEqual(3003, len(lines))
    self.assertEqual('META:\n', lines[0])
    self.assertEqual('RECORDS:\n', lines[2])

    expectedMeta = {
        'export_version': 1,
        'metadataPrefixes': {
            'someprefix': {'schema': 'https://example.org/schema.xsd', 'namespace': 'urn:ns'},
            'prefix': {'schema': 'schema', 'namespace': 'namespace'},
        },
        'sets': {
            'a': {'setName': 'A'},
            'a:b': {'setName': ''},
            'd': {'setName': ''},
            'd:e': {'setName': ''},
            'd:e:f': {'setName': ''},
            'setSpec': {'setName': 'setName'},
        },
    }
    self.assertEqual(expectedMeta, loads(lines[1].strip()))

    def recordAt(index):
        # Parse one dumped record line.
        return loads(lines[index].strip())

    firstRecord = recordAt(3)
    # The timestamp is divided by 10**6 before being compared with time().
    self.assertAlmostEqual(time(), firstRecord['timestamp'] / 10.0**6, delta=3)
    firstRecord['timestamp'] = 'TIMESTAMP'
    self.assertEqual({
        'identifier': 'id:0',
        'timestamp': 'TIMESTAMP',
        'tombstone': False,
        'deletedPrefixes': [],
        'prefixes': ['prefix'],
        'deletedSets': [],
        'sets': [],
    }, firstRecord)

    allSets = ['a', 'a:b', 'd', 'd:e', 'd:e:f']

    prefixDeleted = recordAt(-3)
    prefixDeleted['timestamp'] = 'TIMESTAMP'
    self.assertEqual({
        'identifier': 'id:2',
        'timestamp': 'TIMESTAMP',
        'tombstone': False,
        'deletedPrefixes': ['someprefix'],
        'prefixes': ['prefix', 'someprefix'],
        'deletedSets': [],
        'sets': allSets,
    }, prefixDeleted)

    setDeleted = recordAt(-2)
    setDeleted['timestamp'] = 'TIMESTAMP'
    self.assertEqual({
        'identifier': 'id:3',
        'timestamp': 'TIMESTAMP',
        'tombstone': False,
        'deletedPrefixes': [],
        'prefixes': ['prefix', 'someprefix'],
        'deletedSets': ['d:e:f'],
        'sets': allSets,
    }, setDeleted)

    tombstone = recordAt(-1)
    tombstone['timestamp'] = 'TIMESTAMP'
    self.assertEqual({
        'identifier': 'id:7',
        'timestamp': 'TIMESTAMP',
        'tombstone': True,
        'deletedPrefixes': ['prefix'],
        'prefixes': ['prefix'],
        'deletedSets': [],
        'sets': [],
    }, tombstone)
class OaiInfoTest(SeecrTestCase):
    """Tests for the JSON info endpoints that OaiInfo serves for an OaiJazz.

    The fixture holds three records in ``prefix1`` (one of them also in
    ``oai``), all in ``set1``; ``record3`` is additionally in ``set2`` and
    is deleted again.
    """

    def setUp(self):
        super(OaiInfoTest, self).setUp()
        self.oaiInfo = OaiInfo(reactor=CallTrace(), oaiPath='/')
        self.jazz = OaiJazz(self.tempdir)
        self.top = be((Observable(), (self.oaiInfo, (self.jazz, ))))
        self.jazz.updateSet(setSpec="set1", setName="set1")
        self.jazz.updateSet(setSpec="set2", setName="set name 2")
        self.jazz.updateMetadataFormat(prefix="prefix1", schema="", namespace="")
        self.jazz.updateMetadataFormat(prefix="oai", schema="oai-schema", namespace="oai-namespace")
        self.jazz.addOaiRecord(identifier='record1', setSpecs=['set1'], metadataPrefixes=['prefix1'])
        self.jazz.addOaiRecord(identifier='record2', setSpecs=['set1'], metadataPrefixes=['prefix1', 'oai'])
        self.jazz.addOaiRecord(identifier='record3', setSpecs=['set1', 'set2'], metadataPrefixes=['prefix1'])
        consume(self.jazz.delete(identifier='record3'))
        self.jazz.commit()

    def _requestJson(self, path, arguments):
        """Send a request for *path* to the tree and return the decoded JSON body.

        The response must contain exactly one header/body separator;
        anything else makes the unpacking below fail.
        """
        response = asString(self.top.all.handleRequest(path=path, arguments=arguments))
        _, body = response.split('\r\n\r\n')
        return loads(body)

    def testInfo(self):
        info = self._requestJson('/info/json', {})
        lastStamp = self.jazz.getLastStampId(prefix=None)
        self.assertIsNotNone(lastStamp)
        self.assertEqual(
            {
                'totalRecords': {'total': 3, 'deletes': 1},
                'lastStamp': lastStamp
            },
            info)

    def testGetAllSets(self):
        self.assertEqual(['set1', 'set2'], self._requestJson('/info/json/sets', {}))

    def testGetAllPrefixes(self):
        self.assertEqual(['oai', 'prefix1'], self._requestJson('/info/json/prefixes', {}))

    def testPrefixInfo(self):
        info = self._requestJson('/info/json/prefix', dict(prefix=['prefix1']))
        lastStamp = self.jazz.getLastStampId(prefix='prefix1')
        self.assertIsNotNone(lastStamp)
        self.assertEqual(
            dict(prefix='prefix1',
                 schema='',
                 namespace='',
                 nrOfRecords=dict(total=3, deletes=1),
                 lastStamp=lastStamp),
            info)

        info = self._requestJson('/info/json/prefix', dict(prefix=['oai']))
        oaiLastStamp = self.jazz.getLastStampId(prefix='oai')
        self.assertIsNotNone(oaiLastStamp)
        self.assertNotEqual(lastStamp, oaiLastStamp)
        self.assertEqual(
            dict(prefix='oai',
                 schema='oai-schema',
                 namespace='oai-namespace',
                 nrOfRecords=dict(total=1, deletes=0),
                 lastStamp=oaiLastStamp),
            info)

    def testUnknownPrefixInfo(self):
        info = self._requestJson('/info/json/prefix', dict(prefix=['unknown']))
        self.assertEqual({}, info)

    def testSetInfo(self):
        info = self._requestJson('/info/json/set', dict(set=['set1']))
        lastStamp = self.jazz.getLastStampId(setSpec='set1', prefix=None)
        self.assertIsNotNone(lastStamp)
        self.assertEqual(
            dict(setSpec='set1',
                 name='set1',
                 nrOfRecords=dict(total=3, deletes=1),
                 lastStamp=lastStamp),
            info)

        info = self._requestJson('/info/json/set', dict(set=['set2']))
        set2LastStamp = self.jazz.getLastStampId(setSpec='set2', prefix=None)
        # The last change in set1 was made to record3, which is also in set2.
        self.assertEqual(lastStamp, set2LastStamp)
        self.assertEqual(
            dict(setSpec='set2',
                 name='set name 2',
                 nrOfRecords=dict(total=1, deletes=1),
                 lastStamp=set2LastStamp),
            info)

    def testResumptionTokenInfo(self):
        firstRecord = next(self.jazz.oaiSelect(prefix='prefix1', batchSize=1).records)
        resumptionToken = ResumptionToken(metadataPrefix='prefix1', continueAfter=firstRecord.stamp)
        info = self._requestJson(
            '/info/json/resumptiontoken',
            dict(resumptionToken=[str(resumptionToken)]))
        self.assertEqual(
            {
                'prefix': 'prefix1',
                'set': None,
                'from': None,
                'until': None,
                'nrOfRecords': {'total': 3, 'deletes': 1},
                'nrOfRemainingRecords': {'total': 2, 'deletes': 1},
                'timestamp': firstRecord.stamp
            },
            info)
def main(reactor, port, statePath, lucenePort, gatewayPort, quickCommit=False, **ignored):
    """Return the component tree of the NARCIS api server.

    A read-only Lucene query helix (on *lucenePort*) is combined with a
    periodic OAI download from the gateway (on *gatewayPort*) and an HTTP
    server on *port* that answers ``/oai``, ``/sru`` and ``/rss``. State is
    kept below *statePath*. With *quickCommit* the gateway is polled every
    1 instead of every 10. Extra keyword arguments are accepted and ignored.
    """
    ######## START Lucene Integration ###############################################################
    defaultLuceneSettings = LuceneSettings(commitTimeout=30, readonly=True)

    http11Request = be(
        (HttpRequest1_1(),
            (SocketPool(
                reactor=reactor,
                unusedTimeout=5,
                limits={'totalSize': 100, 'destinationSize': 10}),
            ),
        )
    )

    readonlySettings = defaultLuceneSettings.clone(readonly=True)
    luceneIndex = luceneAndReaderConfig(readonlySettings, http11Request, lucenePort)

    luceneRoHelix = be(
        (AdapterToLuceneQuery(
                defaultCore=DEFAULT_CORE,
                coreConverters={
                    DEFAULT_CORE: QueryExpressionToLuceneQueryDict(
                        UNQUALIFIED_TERM_FIELDS,
                        luceneSettings=luceneIndex.settings),
                }
            ),
            (MultiLucene(host='localhost', port=lucenePort, defaultCore=DEFAULT_CORE),
                (luceneIndex,),
                (http11Request,),
            )
        )
    )
    ######## END Lucene Integration ###############################################################

    fieldnameRewrites = {
        # UNTOKENIZED_PREFIX+'genre': UNTOKENIZED_PREFIX+'dc:genre',
    }

    def fieldnameRewrite(name):
        # Unknown names map onto themselves.
        return fieldnameRewrites.get(name, name)

    def drilldownFieldnamesTranslate(fieldname):
        # Prefer the untokenized variant when one is registered.
        candidate = untokenizedFieldname(fieldname)
        chosen = candidate if candidate in untokenizedFieldnames else fieldname
        return fieldnameRewrite(chosen)

    convertToComposedQuery = ConvertToComposedQuery(
        resultsFrom=DEFAULT_CORE,
        matches=[],
        drilldownFieldnamesTranslate=drilldownFieldnamesTranslate)

    hashStrategy = Md5HashDistributeStrategy()
    removableParts = [
        HEADER_PARTNAME,
        META_PARTNAME,
        METADATA_PARTNAME,
        OAI_DC_PARTNAME,
        LONG_PARTNAME,
        SHORT_PARTNAME,
    ]
    storage = StorageComponent(
        join(statePath, 'store'),
        strategy=hashStrategy,
        partsRemovedOnDelete=removableParts)

    oaiJazz = OaiJazz(join(statePath, 'oai'))
    oaiJazz.updateMetadataFormat(
        OAI_DC_PARTNAME,
        "http://www.openarchives.org/OAI/2.0/oai_dc.xsd",
        "http://purl.org/dc/elements/1.1/")
    # def updateMetadataFormat(self, prefix, schema, namespace):
    # oaiJazz.updateMetadataFormat("knaw_long", "http://www.narcis.nl/scheme/knaw_long.xsd", "http://www.knaw.nl/narcis/1.0/long/")
    # What does this do?

    cqlClauseConverters = [
        RenameFieldForExact(
            untokenizedFields=untokenizedFieldnames,
            untokenizedPrefix=UNTOKENIZED_PREFIX,
        ).filterAndModifier(),
        SearchTermFilterAndModifier(
            shouldModifyFieldValue=lambda *args: True,
            fieldnameModifier=fieldnameRewrite,
        ).filterAndModifier(),
    ]

    # WST: Interval in seconds before sending a new request to the GATEWAY in
    # case of an error while processing batch records.(default=1).
    # IntegrationTests need 1 second! Otherwise tests will fail!
    pollPeriod = 1 if quickCommit else 10
    gatewayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(period=pollPeriod),
        name='api',
        autoStart=True)

    harvestState = join(statePath, 'harvesterstate', 'gateway')
    oaiDownload = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=harvestState,
        userAgentAddition='ApiServer',
        xWait=True,
        name='api',
        autoCommit=False)

    executeQueryHelix = (
        FilterMessages(allowed=['executeQuery']),
        (CqlMultiSearchClauseConversion(cqlClauseConverters, fromKwarg='query'),
            (DrilldownQueries(),
                (convertToComposedQuery,
                    (luceneRoHelix,),
                )
            )
        )
    )

    # The components below are created in the same order as they appear in
    # the tree that is returned.
    root = Observable()
    downloadHelix = createDownloadHelix(reactor, gatewayDownload, oaiDownload, storage, oaiJazz)
    httpServer = ObservableHttpServer(reactor, port, compressResponse=True)
    httpHandler = BasicHttpHandler()

    oaiBranch = (
        PathFilter(["/oai"]),
        (OaiPmh(repositoryName="NARCIS OAI-pmh", adminEmail="*****@*****.**"),
            (oaiJazz,),
            (StorageAdapter(),
                (storage,)
            ),
            (OaiBranding(
                    url="http://www.narcis.nl/images/logos/logo-knaw-house.gif",
                    link="http://oai.narcis.nl",
                    title="Narcis - The gateway to scholarly information in The Netherlands"),
            ),
            (OaiProvenance(
                    nsMap=NAMESPACEMAP,
                    baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                    harvestDate=('meta', '//meta:record/meta:harvestdate/text()'),
                    metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                    identifier=('header', '//oai:identifier/text()'),
                    datestamp=('header', '//oai:datestamp/text()')
                ),
                (storage,)
            )
        )
    )

    sruBranch = (
        PathFilter(['/sru']),
        (SruParser(
                host='sru.narcis.nl',
                port=80,
                defaultRecordSchema='knaw_short',
                defaultRecordPacking='xml'),
            (SruLimitStartRecord(limitBeyond=4000),
                (SruHandler(
                        includeQueryTimes=False,
                        extraXParameters=[],
                        enableCollectLog=False),
                    (SruTermDrilldown(),),
                    executeQueryHelix,
                    (StorageAdapter(),
                        (storage,)
                    )
                )
            )
        )
    )

    rssBranch = (
        PathFilter('/rss'),
        (Rss(
                supportedLanguages=['nl', 'en'],  # defaults to first, if requested language is not available or supplied.
                title={'nl': 'NARCIS', 'en': 'NARCIS'},
                description={
                    'nl': 'NARCIS: De toegang tot de Nederlandse wetenschapsinformatie',
                    'en': 'NARCIS: The gateway to Dutch scientific information',
                },
                link={
                    'nl': 'http://www.narcis.nl/?Language=nl',
                    'en': 'http://www.narcis.nl/?Language=en',
                },
                maximumRecords=20),
            executeQueryHelix,
            (RssItem(
                    nsMap=NAMESPACEMAP,
                    title=('knaw_short', {
                        'nl': '//short:metadata/short:titleInfo[not (@xml:lang)]/short:title/text()',
                        'en': '//short:metadata/short:titleInfo[@xml:lang="en"]/short:title/text()',
                    }),
                    description=('knaw_short', {
                        'nl': '//short:abstract[not (@xml:lang)]/text()',
                        'en': '//short:abstract[@xml:lang="en"]/text()',
                    }),
                    pubdate=('knaw_short', '//short:dateIssued/short:parsed/text()'),
                    linkTemplate='http://www.narcis.nl/%(wcpcollection)s/RecordID/%(oai_identifier)s/Language/%(language)s',
                    wcpcollection=('meta', '//*[local-name() = "collection"]/text()'),
                    oai_identifier=('meta', '//meta:record/meta:id/text()'),
                    language=('Dummy: Language is auto provided by the calling RSS component, but needs to be present to serve the linkTemplate.')
                ),
                (StorageAdapter(),
                    (storage,)
                )
            )
        )
    )

    return (
        root,
        downloadHelix,
        (httpServer,
            (httpHandler,
                oaiBranch,
                sruBranch,
                rssBranch,
            )
        )
    )
def main(reactor, port, statePath, indexPort, gatewayPort, **ignored):
    """Build the component tree for the HTTP server listening on ``port``.

    Creates the download, storage, OAI and query components and returns them
    wired together as one nested tuple rooted at ``Observable()``.

    Args:
        reactor: Handed to every component below that takes a reactor
            (PeriodicDownload, PeriodicCall, DebugPrompt,
            ObservableHttpServer and createDownloadHelix).
        port: Port given to ObservableHttpServer; DebugPrompt gets
            ``port + 1``.
        statePath: Base directory. The sub-paths ``harvesterstate/gateway``,
            ``store``, ``oai`` and ``log`` are derived from it.
        indexPort: Port on localhost used for LuceneRemote.
        gatewayPort: Port on localhost used for PeriodicDownload.
        **ignored: Any further keyword arguments are accepted and not used.

    Returns:
        A nested tuple of components. Only ``scheduledCommitPeriodicCall`` is
        passed through ``be(...)`` here; the outer tuple is returned as-is
        (presumably the caller turns it into live observers -- TODO confirm).
    """
    apacheLogStream = sys.stdout
    periodicDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        name='gateway')
    oaiDownload = OaiDownloadProcessor(
        path='/oai',
        metadataPrefix='oai_dc',
        workingDirectory=join(statePath, 'harvesterstate', 'gateway'),
        xWait=True,
        name='gateway',
        autoCommit=False)

    # Prefixes `name` with SORTED_PREFIX unless it starts with '__'.
    # NOTE(review): not referenced anywhere else in this function; it is still
    # reachable through the `locals()` snapshot handed to DebugPrompt below,
    # so removing or renaming it changes what that prompt sees.
    def sortFieldRename(name):
        if not name.startswith('__'):
            name = SORTED_PREFIX + name
        return name

    # Field name -> replacement name. Empty here, so fieldnameRewrite()
    # currently returns every name unchanged.
    fieldnameRewrites = { }

    # Looks `name` up in fieldnameRewrites, falling back to `name` itself.
    def fieldnameRewrite(name):
        return fieldnameRewrites.get(name, name)

    # Switches to the untokenized variant of `fieldname` when that variant is
    # listed in untokenizedFieldnames, then applies fieldnameRewrite().
    def drilldownFieldnamesTranslate(fieldname):
        untokenizedName = untokenizedFieldname(fieldname)
        if untokenizedName in untokenizedFieldnames:
            fieldname = untokenizedName
        return fieldnameRewrite(fieldname)

    convertToComposedQuery = ConvertToComposedQuery(
        resultsFrom=DEFAULT_CORE,
        matches=[],
        drilldownFieldnamesTranslate=drilldownFieldnamesTranslate
    )

    luceneRemote = LuceneRemote(host='localhost', port=indexPort, path='/lucene')
    storage = StorageComponent(join(statePath, 'store'))
    oaiJazz = OaiJazz(join(statePath, 'oai'))
    # Registers the 'oai_dc' metadata format; the second and third arguments
    # are passed as None.
    oaiJazz.updateMetadataFormat('oai_dc', None, None)

    # Converters handed to CqlMultiSearchClauseConversion in executeQueryHelix.
    cqlClauseConverters = [
        RenameFieldForExact(
            untokenizedFields=untokenizedFieldnames,
            untokenizedPrefix=UNTOKENIZED_PREFIX,
        ).filterAndModifier(),
        SearchTermFilterAndModifier(
            # Always True: every clause is offered to fieldnameModifier.
            shouldModifyFieldValue=lambda *args: True,
            fieldnameModifier=fieldnameRewrite
        ).filterAndModifier(),
    ]

    # PeriodicCall configured with message='commit' and Schedule(period=1) for
    # both the initial and the recurring schedule; AllToDo sits between it and
    # `storage` / `periodicDownload`.
    # NOTE(review): presumably 'commit' reaches both observers once per
    # second -- confirm the unit of Schedule.period.
    scheduledCommitPeriodicCall = be(
        (PeriodicCall(reactor, message='commit', name='Scheduled commit', initialSchedule=Schedule(period=1), schedule=Schedule(period=1)),
            (AllToDo(),
                (storage,),
                (periodicDownload,),
            )
        )
    )

    # Shared by both QueryLogWriter instances and by LogFileServer under /log.
    directoryLog = DirectoryLog(join(statePath, 'log'), extension='-query.log')

    # Query sub-tree; the same tuple object is placed under both the /sru and
    # the /rss branch of the returned tree. FilterMessages only allows
    # 'executeQuery'.
    executeQueryHelix = \
        (FilterMessages(allowed=['executeQuery']),
            (CqlMultiSearchClauseConversion(cqlClauseConverters, fromKwarg='query'),
                (DrilldownQueries(),
                    (convertToComposedQuery,
                        (luceneRemote,),
                    )
                )
            ),
        )

    return \
    (Observable(),
        (scheduledCommitPeriodicCall,),
        # locals() is evaluated here, so the prompt receives every local
        # defined above in this function (including the parameters).
        (DebugPrompt(reactor=reactor, port=port+1, globals=locals()),),
        createDownloadHelix(reactor, periodicDownload, oaiDownload, storage, oaiJazz),
        (ObservableHttpServer(reactor, port, compressResponse=True),
            (LogCollector(),
                (ApacheLogWriter(apacheLogStream),),
                # One query-log writer per scope name; the scope names match
                # the LogCollectorScope instances further down.
                (QueryLogWriter.forHttpArguments(
                        log=directoryLog,
                        scopeNames=('http-scope',)
                    ),
                ),
                (QueryLogWriter(log=directoryLog, scopeNames=('sru-scope',)),),
                (Deproxy(),
                    (HandleRequestLog(),
                        (BasicHttpHandler(),
                            # /oai: OAI-PMH endpoint over oaiJazz and storage.
                            (PathFilter(["/oai"]),
                                (LogCollectorScope("http-scope"),
                                    (OaiPmh(repositoryName="Example OAI", adminEmail="*****@*****.**"),
                                        (oaiJazz,),
                                        (StorageAdapter(),
                                            (storage,)
                                        ),
                                    )
                                )
                            ),
                            # /sru: SRU endpoint; queries go through
                            # executeQueryHelix, records come from storage.
                            (PathFilter(['/sru']),
                                (LogCollectorScope('sru-scope'),
                                    (SruParser(
                                            host='example.org',
                                            port=80,
                                            defaultRecordSchema=DEFAULT_CORE,
                                            defaultRecordPacking='xml'),
                                        (SruLimitStartRecord(limitBeyond=1000),
                                            (SruHandler(
                                                    includeQueryTimes=True,
                                                    extraXParameters=[],
                                                    enableCollectLog=True),
                                                (SruTermDrilldown(),),
                                                executeQueryHelix,
                                                (StorageAdapter(),
                                                    (storage,)
                                                )
                                            )
                                        )
                                    )
                                )
                            ),
                            # /rss: RSS feed built from the same query helix.
                            (PathFilter('/rss'),
                                (Rss(
                                        title = 'Meresco',
                                        description = 'RSS feed for Meresco',
                                        link = 'http://meresco.org',
                                        maximumRecords = 15),
                                    executeQueryHelix,
                                    (RssItem(
                                            nsMap={
                                                'dc': "http://purl.org/dc/elements/1.1/",
                                                'oai_dc': "http://www.openarchives.org/OAI/2.0/oai_dc/"
                                            },
                                            title = ('oai_dc', '/oai_dc:dc/dc:title/text()'),
                                            description = ('oai_dc', '/oai_dc:dc/dc:description/text()'),
                                            # NOTE(review): the %(identifier)s placeholder suggests
                                            # %-formatting, under which '%%3D' becomes the literal
                                            # '%3D' (URL-encoded '=') -- confirm in RssItem.
                                            linkTemplate = 'http://localhost/sru?operation=searchRetrieve&version=1.2&query=dc:identifier%%3D%(identifier)s',
                                            identifier = ('oai_dc', '/oai_dc:dc/dc:identifier/text()')),
                                        (StorageAdapter(),
                                            (storage,)
                                        )
                                    ),
                                )
                            ),
                            # /log: serves the files of directoryLog.
                            (PathFilter('/log'),
                                (LogFileServer(name="Example Queries", log=directoryLog, basepath='/log'),)
                            ),
                        ),
                    )
                )
            )
        ),
    )