def testHandleRequest(self):
    # handleRequest must reach the observer exactly once, with the query
    # string of RequestURI parsed into the 'arguments' kwarg. A key without
    # a value ('emptykey') is expected as a list holding one empty string.
    observer = CallTrace('Observer', methods={
        'handleRequest': lambda *a, **kw: (x for x in [])
    })
    s = ObservableHttpServer(CallTrace('Reactor'), 1024)
    s.addObserver(observer)

    list(compose(s.handleRequest(
        RequestURI='http://localhost/path?key=value&emptykey#fragment')))

    # assertEqual/assertIn replace the deprecated assertEquals alias and the
    # assertTrue(x in y) form; they also give better failure messages.
    self.assertEqual(1, len(observer.calledMethods))
    method = observer.calledMethods[0]
    self.assertEqual('handleRequest', method.name)
    self.assertEqual(0, len(method.args))
    self.assertEqual(7, len(method.kwargs))
    self.assertIn('arguments', method.kwargs)
    arguments = method.kwargs['arguments']
    self.assertEqual(2, len(arguments))
    self.assertEqual(['emptykey', 'key'], sorted(arguments.keys()))
    self.assertEqual(['value'], arguments['key'])
    self.assertEqual([''], arguments['emptykey'])
def testServerBindAddress(self):
    # After startServer() the listening socket must report the address given
    # as bindAddress together with the requested port.
    reactor = CallTrace()
    port = PortNumberGenerator.next()
    server = ObservableHttpServer(reactor, port, bindAddress='127.0.0.1')
    server.startServer()
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(
        ('127.0.0.1', port),
        server._httpserver._acceptor._sok.getsockname())
def testErrorHandlerRegisteredOnWeightlessHttpServer(self):
    # The handler created by the acceptor's sink factory must carry the
    # server's own _error method as its error handler.
    reactor = CallTrace('Reactor')
    s = ObservableHttpServer(reactor, 1024, maxConnections=5)
    s.startServer()
    acceptor = s._httpserver._acceptor
    httphandler = acceptor._sinkFactory('sok')
    errorHandler = httphandler._errorHandler
    # assertEqual performs the same == comparison as the former
    # assertTrue(a == b), but reports both operands when it fails.
    self.assertEqual(errorHandler, s._error)
def testSimpleHandleRequest(self):
    # A request without path or query must still be forwarded to the observer
    # as one handleRequest call that uses keyword arguments only.
    observer = CallTrace('Observer', methods={
        'handleRequest': lambda *a, **kw: (x for x in [])
    })
    s = ObservableHttpServer(CallTrace('Reactor'), 1024)
    s.addObserver(observer)

    list(compose(s.handleRequest(RequestURI='http://localhost')))

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(1, len(observer.calledMethods))
    method = observer.calledMethods[0]
    self.assertEqual('handleRequest', method.name)
    self.assertEqual(0, len(method.args))
    self.assertEqual(7, len(method.kwargs))
def testSetMaximumConnections(self):
    # setMaxConnections must update the running http server and also the
    # value seen by handlers created afterwards by the acceptor's sink factory.
    reactor = CallTrace('Reactor')
    s = ObservableHttpServer(reactor, 2048, maxConnections=5)
    s.startServer()
    httpserver = s._httpserver
    # assertEqual replaces the deprecated assertEquals alias; the unused
    # local 'acceptor' of the earlier version has been dropped.
    self.assertEqual(5, httpserver._maxConnections)
    s.setMaxConnections(6)
    self.assertEqual(6, httpserver._maxConnections)
    self.assertEqual(
        6, httpserver._acceptor._sinkFactory('a sink')._maxConnections)
def testMaxConnectionsErrorHandling(self):
    # _error must report to observers through logHttpError (passing its
    # keyword arguments on unchanged) and produce a 503 HTTP response.
    observer = CallTrace('Observer', methods={
        'handleRequest': lambda *a, **kw: (x for x in [])
    })
    reactor = CallTrace('Reactor')
    s = ObservableHttpServer(reactor, 1024, maxConnections=5)
    s.addObserver(observer)

    result = ''.join(s._error(ResponseCode=503, something='bicycle'))

    # assertEqual/assertIn replace the deprecated assertEquals alias and the
    # assertTrue(x in y) form.
    self.assertEqual(1, len(observer.calledMethods))
    self.assertEqual('logHttpError', observer.calledMethods[0].name)
    self.assertEqual(
        {'ResponseCode': 503, 'something': 'bicycle'},
        observer.calledMethods[0].kwargs)
    header, body = result.split(CRLF * 2)
    self.assertTrue(header.startswith('HTTP/1.0 503'), header)
    self.assertIn('Service Unavailable', body)
def testSimpleHandleRequest(self):
    # The bare URI 'http://localhost' must result in exactly one
    # handleRequest call on the observer, with no positional arguments.
    observer = CallTrace('Observer', methods={
        'handleRequest': lambda *a, **kw: (x for x in [])
    })
    s = ObservableHttpServer(CallTrace('Reactor'), 1024)
    s.addObserver(observer)

    list(compose(s.handleRequest(RequestURI='http://localhost')))

    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(1, len(observer.calledMethods))
    method = observer.calledMethods[0]
    self.assertEqual('handleRequest', method.name)
    self.assertEqual(0, len(method.args))
    self.assertEqual(7, len(method.kwargs))
def testHandleRequest(self):
    # The query string of RequestURI must arrive at the observer parsed into
    # the 'arguments' kwarg: 'key' -> ['value'], 'emptykey' -> [''].
    observer = CallTrace('Observer', methods={
        'handleRequest': lambda *a, **kw: (x for x in [])
    })
    s = ObservableHttpServer(CallTrace('Reactor'), 1024)
    s.addObserver(observer)

    list(compose(s.handleRequest(
        RequestURI='http://localhost/path?key=value&emptykey#fragment')))

    # assertEqual/assertIn replace the deprecated assertEquals alias and the
    # assertTrue(x in y) form.
    self.assertEqual(1, len(observer.calledMethods))
    method = observer.calledMethods[0]
    self.assertEqual('handleRequest', method.name)
    self.assertEqual(0, len(method.args))
    self.assertEqual(7, len(method.kwargs))
    self.assertIn('arguments', method.kwargs)
    arguments = method.kwargs['arguments']
    self.assertEqual(2, len(arguments))
    self.assertEqual(['emptykey', 'key'], sorted(arguments.keys()))
    self.assertEqual(['value'], arguments['key'])
    self.assertEqual([''], arguments['emptykey'])
def dna(reactor, host, portNumber, databasePath):
    """Return the server DNA: a nested tuple of components rooted at an Observable.

    One ObservableHttpServer is wired to three PathFilter branches:
    '/update' (SRURecordUpdate -> Amara2Lxml -> createUploadHelix(...)),
    '/oai' (OaiPmh with oaiJazz, storage and OaiProvenance) and
    '/rss' (LoggerRSS backed by the logger and storage components).

    reactor      -- passed to ObservableHttpServer.
    host         -- not referenced anywhere in this function.
    portNumber   -- port passed to ObservableHttpServer.
    databasePath -- base directory; 'storage', 'logger' and 'oai'/'data' are
                    joined onto it for the respective components.

    NOTE(review): the returned tuple is presumably handed to be() by the
    caller -- confirm at the call site.
    """
    #Choose ONE storage strategy:
    #strategie = HashDistributeStrategy() #irreversible?
    #strategie = DefaultStrategy()
    strategie = Md5HashDistributeStrategy()

    ## Define which parts should be removed from storage on an SRU delete update.
    storageComponent = StorageComponent(join(databasePath, 'storage'), partsRemovedOnDelete=[NL_DIDL_NORMALISED_PREFIX, NL_DIDL_COMBINED_PREFIX, 'metadata'], strategy=strategie)

    loggerComponent = Logger(join(databasePath, 'logger'))

    oaiJazz = OaiJazz(join(databasePath, 'oai', 'data'))

    return \
    (Observable(),
        (ObservableHttpServer(reactor, portNumber),
            # Branch 1: record updates.
            (PathFilter("/update"),
                (SRURecordUpdate(),
                    (Amara2Lxml(fromKwarg='amaraNode', toKwarg='lxmlNode'),
                        createUploadHelix(storageComponent, oaiJazz, loggerComponent)
                    )
                )
            ),
            # Branch 2: OAI-PMH.
            (PathFilter('/oai'),
                #XWAIT: (OaiPmh(repositoryName='repositoryName', adminEmail='adminEmail', batchSize=2, supportXWait=True)
                (OaiPmh(repositoryName='Gemeenschappelijke Harvester DANS-KB', adminEmail='*****@*****.**', batchSize=100, fixIdentifyBaseURL=True), ## batchSize = number of records before issuing a resumptionToken...
                    (oaiJazz,),
                    (storageComponent,),
                    (OaiProvenance( ## NOTE: If one of the following fields is missing, provenance will NOT be written.
                            nsMap=namespacesMap,
                            baseURL = ('meta', '//*[local-name() = "baseurl"]/text()'),
                            harvestDate = ('meta', '//*[local-name() = "harvestdate"]/text()'),
                            #See: http://www.openarchives.org/OAI/2.0/guidelines-provenance.htm
                            metadataNamespace = (NL_DIDL_NORMALISED_PREFIX, '//mods:mods/namespace::node()[name()="" or name()="mods" or contains(.,"mods")]'),
                            # Some 'magic' here: xpath() function may return different types.
                            # (Namespace) nodes return tuples instead of an Element object. (string) Functions return strings, etc...
                            # Since meresco.oai.OaiProvenance handles all return objects from xpath() the same, the results were unpredictable.
                            # This is why we have overridden the XmlCompose.
                            # See: http://lxml.de/xpathxslt.html#xpath-return-values
                            identifier = ('header', '/oai:header/oai:identifier/text()'),
                            datestamp = ('header', '/oai:header/oai:datestamp/text()')
                        ),
                        (storageComponent,)
                    )
                )
            ),
            # Branch 3: RSS view on the logger component.
            (PathFilter('/rss'),
                (LoggerRSS(
                        title = 'Gemeenschappelijke Harvester DANS-KB',
                        description = 'Harvester normalisation log for: ',
                        link = 'http://rss.gharvester.dans.knaw.nl/rss',
                        maximumRecords = 30),
                    (loggerComponent,
                        (storageComponent,)
                    )
                )
            )
        )
    )
def main(reactor, portNumber, dumpdir):
    """Build and initialise a server tree: Observable -> ObservableHttpServer -> Dump.

    dumpdir is created first when it is not an existing directory. The tree
    is assembled with be() and its observer_init is run once via
    server.once.
    """
    if not isdir(dumpdir):
        makedirs(dumpdir)

    # Components are created in the same order as in a nested tuple literal.
    root = Observable()
    httpServer = ObservableHttpServer(reactor, portNumber)
    dumper = Dump(dumpdir)

    server = be((root, (httpServer, (dumper,))))
    initialisation = server.once.observer_init()
    list(compose(initialisation))
def testCompressResponseFlag(self):
    # compressResponse is expected to be True when not given, and otherwise
    # to be passed on unchanged to the underlying http server.
    reactor = CallTrace('Reactor')

    # assertEqual replaces the deprecated assertEquals alias.
    s = ObservableHttpServer(reactor, 0)
    s.startServer()
    httpserver = s._httpserver
    self.assertEqual(True, httpserver._compressResponse)

    s = ObservableHttpServer(reactor, 0, compressResponse=True)
    s.startServer()
    httpserver = s._httpserver
    self.assertEqual(True, httpserver._compressResponse)

    s = ObservableHttpServer(reactor, 0, compressResponse=False)
    s.startServer()
    httpserver = s._httpserver
    self.assertEqual(False, httpserver._compressResponse)
def testMaxConnectionsErrorHandling(self):
    # _error must notify observers with logHttpError, forwarding its keyword
    # arguments unchanged, and yield a 'HTTP/1.0 503' response whose body
    # mentions 'Service Unavailable'.
    observer = CallTrace('Observer', methods={
        'handleRequest': lambda *a, **kw: (x for x in [])
    })
    reactor = CallTrace('Reactor')
    s = ObservableHttpServer(reactor, 1024, maxConnections=5)
    s.addObserver(observer)

    result = ''.join(s._error(ResponseCode=503, something='bicycle'))

    # assertEqual/assertIn replace the deprecated assertEquals alias and the
    # assertTrue(x in y) form.
    self.assertEqual(1, len(observer.calledMethods))
    self.assertEqual('logHttpError', observer.calledMethods[0].name)
    self.assertEqual({
        'ResponseCode': 503,
        'something': 'bicycle'
    }, observer.calledMethods[0].kwargs)
    header, body = result.split(CRLF * 2)
    self.assertTrue(header.startswith('HTTP/1.0 503'), header)
    self.assertIn('Service Unavailable', body)
def dna(reactor, port, dynamic, static, verbose=True):
    """Return the DNA tuple for a server with a '/static' and a dynamic branch.

    reactor -- passed to ObservableHttpServer and DynamicHtml.
    port    -- passed to ObservableHttpServer as port=.
    dynamic -- single entry of the list given to DynamicHtml.
    static  -- argument for FileServer, mounted under '/static'.
    verbose -- when true ApacheLogWriter gets stdout, otherwise None.
    """
    def stripStaticPrefix(path):
        # Drop the leading '/static' so FileServer sees the remainder.
        return path[len('/static'):]

    # Components are instantiated in the same order as a nested tuple
    # literal would evaluate them (depth first, left to right).
    root = Observable()
    httpServer = ObservableHttpServer(reactor, port=port)
    logCollector = LogCollector()
    logStream = stdout if verbose else None
    logWriter = ApacheLogWriter(logStream)
    requestLog = HandleRequestLog()
    httpHandler = BasicHttpHandler()
    staticFilter = PathFilter('/static')
    staticRename = PathRename(stripStaticPrefix)
    fileServer = FileServer(static)
    dynamicFilter = PathFilter('/', excluding=['/static'])
    dynamicHtml = DynamicHtml([dynamic], reactor=reactor, indexPage='/index')

    staticBranch = (staticFilter, (staticRename, (fileServer,)))
    dynamicBranch = (dynamicFilter, (dynamicHtml,))
    return (root,
        (httpServer,
            (logCollector,
                (logWriter,),
                (requestLog,
                    (httpHandler,
                        staticBranch,
                        dynamicBranch,
                    )
                )
            )
        )
    )
def testSetMaximumConnections(self):
    # Changing the maximum with setMaxConnections must be visible on the
    # running http server and on handlers produced by its sink factory.
    server = ObservableHttpServer(CallTrace('Reactor'), 2048, maxConnections=5)
    server.startServer()
    try:
        inner = server._httpserver
        self.assertEqual(5, inner._maxConnections)

        server.setMaxConnections(6)

        self.assertEqual(6, inner._maxConnections)
        sink = inner._acceptor._sinkFactory('a sink')
        self.assertEqual(6, sink._maxConnections)
    finally:
        # Always release the server, also when an assertion fails.
        server.shutdown()
def startOaiPmh(self, portNumber, oaiJazz, storageComponent, register):
    """Serve OaiPmh on portNumber inside a fresh Reactor and run its loop.

    The current thread is attached to the VM environment first. register is
    observed both by OaiPmh directly and by oaiJazz.
    """
    getVMEnv().attachCurrentThread()
    with Reactor() as reactor:
        # Same construction order as the equivalent nested tuple literal.
        root = Observable()
        httpServer = ObservableHttpServer(reactor, portNumber)
        oaiPmh = OaiPmh(
            repositoryName='repositoryName',
            adminEmail='adminEmail',
            batchSize=2,
            supportXWait=True)
        oaiPmhBranch = (oaiPmh,
            (register,),
            (oaiJazz,
                (register,),
            ),
            (storageComponent,),
        )
        server = be((root, (httpServer, oaiPmhBranch)))
        list(compose(server.once.observer_init()))
        self._loopReactor(reactor)
def testServerWithPrio(self):
    # observer_init must register the server with the reactor through a
    # single addReader call that carries the configured prio.
    reactor = CallTrace('reactor')
    server = ObservableHttpServer(reactor, 2000, prio=3)
    server.observer_init()
    try:
        self.assertEqual(['addReader'], reactor.calledMethodNames())
        addReaderCall = reactor.calledMethods[0]
        self.assertEqual(3, addReaderCall.kwargs['prio'])
    finally:
        server.shutdown()
def main(reactor, port, statePath, gatewayPort, dbConfig, quickCommit=False, **ignored):
    """Return the DNA tuple of the resolver server.

    The tree holds a download helix (PeriodicDownload + OaiDownloadProcessor
    + ResolverStorageComponent, assembled by createDownloadHelix) next to an
    ObservableHttpServer that answers '/' with a fixed plain-text string.

    reactor     -- passed to PeriodicDownload, createDownloadHelix and the http server.
    port        -- port for ObservableHttpServer.
    statePath   -- base directory; 'harvesterstate'/'gateway' below it is the
                   working directory of the OaiDownloadProcessor.
    gatewayPort -- port on 'localhost' used by PeriodicDownload.
    dbConfig    -- passed to ResolverStorageComponent.
    quickCommit -- selects a schedule period of .1 instead of 10.
    ignored     -- further keyword arguments; not used.
    """
    #TODO: Implement logging.
    # normLogger = Logger(join(statePath, '..', 'gateway', 'normlogger'))

    dbStorageComponent = ResolverStorageComponent(dbConfig)

    verbose = True  # not referenced further down in this function

    # WST: Interval in seconds before sending a new request to the GATEWAY in
    # case of an error while processing batch records.(default=1).
    # IntegrationTests need <=1 second! Otherwise tests will fail!
    retryPeriod = .1 if quickCommit else 10
    periodicGateWayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(period=retryPeriod),
        name='resolver',
        autoStart=True)

    oaiDownload = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=join(statePath, 'harvesterstate', 'gateway'),
        userAgentAddition='ResolverServer',
        xWait=True,
        name='resolver',
        autoCommit=False)

    # Same construction order as the equivalent nested tuple literal.
    root = Observable()
    downloadHelix = createDownloadHelix(reactor, periodicGateWayDownload, oaiDownload, dbStorageComponent)
    httpServer = ObservableHttpServer(reactor, port, compressResponse=True)
    httpHandler = BasicHttpHandler()
    rootFilter = PathFilter("/")
    banner = StringServer("Resolver Server", ContentTypePlainText)

    return (root,
        downloadHelix,
        (httpServer,
            (httpHandler,
                (rootFilter,
                    (banner,)
                )
            )
        )
    )
def main(reactor, port, directory):
    """Start the OAI test server and fill it with fifteen oai_dc records.

    Below directory the sub-directories 'dump', 'storage' and 'oai' are used
    for Dump, MultiSequentialStorage and OaiJazz. After the server tree has
    been initialised, records 1..15 are added; record 2 gets an identifier
    with characters that need XML escaping and records 3 and 6 are deleted
    again right after being added.
    """
    dumpdir = join(directory, 'dump')
    if not isdir(dumpdir):
        makedirs(dumpdir)
    dump = Dump(dumpdir)
    oaiStorage = MultiSequentialStorage(join(directory, 'storage'))
    oaiJazz = OaiJazz(join(directory, 'oai'))

    dna = (Observable(),
        (ObservableHttpServer(reactor, port),
            (PathFilter("/dump"),
                (dump,)
            ),
            (PathFilter("/control"),
                (Control(),
                    (dump,),
                    (Log(),),
                )
            ),
            (PathFilter('/oai'),
                (Log(),
                    (OaiPmh(repositoryName="Oai Test Server", adminEmail="*****@*****.**", batchSize=10),
                        (oaiStorage,),
                        (oaiJazz,),
                    )
                )
            ),
            (PathFilter('/badoai'),
                (Log(),
                    (BadOai(),)
                )
            ),
            (PathFilter("/log"),
                (RetrieveLog(),
                    (Log(),)
                )
            ),
            (PathFilter("/ready"),
                (StringServer('yes', ContentTypePlainText),)
            )
        )
    )
    server = be(dna)
    list(compose(server.once.observer_init()))

    oaiJazz.updateMetadataFormat(
        prefix="oai_dc",
        schema="http://www.openarchives.org/OAI/2.0/oai_dc.xsd",
        namespace="http://www.openarchives.org/OAI/2.0/oai_dc/")

    recordTemplate = '''<oai_dc:dc xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xmlns:dc="http://purl.org/dc/elements/1.1/" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/oai_dc/ http://www.openarchives.org/OAI/2.0/oai_dc.xsd"><dc:identifier>%s</dc:identifier><dc:title>Title is √</dc:title></oai_dc:dc>'''
    for i in range(1, 16):
        # Record 2 deliberately carries '/' and '&' in its identifier.
        identifier = 'oai:record:02/&gkn' if i == 2 else 'oai:record:%02d' % i
        xml = recordTemplate % escapeXml(identifier)
        oaiStorage.addData(
            identifier=identifier,
            name='oai_dc',
            data=bytes(xml, encoding='utf-8'))
        oaiJazz.addOaiRecord(identifier=identifier, metadataPrefixes=['oai_dc'])
        if i in [3, 6]:
            list(compose(oaiJazz.delete(identifier=identifier)))
def main(port):
    """Run a DynamicHtml server on port until the reactor loop ends.

    port is converted with int(). Templates are read from the 'dynamic'
    directory below mydir and get httpget, urlencode, json_loads and
    namespaces as additional globals.
    """
    reactor = Reactor()

    # Same construction order as the equivalent nested tuple literal.
    root = Observable()
    httpServer = ObservableHttpServer(reactor, int(port))
    templateDirs = [join(mydir, "dynamic")]
    templateGlobals = {
        'httpget': httpget,
        'urlencode': urlencode,
        'json_loads': loads,
        'namespaces': namespaces
    }
    dynamicHtml = DynamicHtml(
        templateDirs,
        reactor=reactor,
        indexPage="/index",
        additionalGlobals=templateGlobals)

    dna = (root, (httpServer, (dynamicHtml,)))
    server = be(dna)
    list(compose(server.once.observer_init()))
    reactor.loop()
def main(reactor, port, statePath, **ignored):
    """Return the DNA tuple of the gateway server (DIDL/MODS normalisation).

    Two top-level branches hang below BasicHttpHandler:
    - an IpFilter (allowedIps=['127.0.0.1']) containing '/oaix' (OaiPmh) and
      '/oaix/info' (OaiInfo);
    - '/update' (SruRecordUpdate) with one FilterMessages branch for 'delete'
      and one for 'add'. The 'add' branch validates, normalises and stores the
      record and registers it with oaiJazz.

    reactor   -- passed to ObservableHttpServer and OaiInfo.
    port      -- port for ObservableHttpServer.
    statePath -- base directory; 'oai', 'normlogger' and 'store' are joined onto it.
    ignored   -- further keyword arguments; not used.
    """
    oaiSuspendRegister = SuspendRegister()
    # oaiJazz is itself a small tree: OaiJazz observed by the suspend register.
    oaiJazz = be((OaiJazz(join(statePath, 'oai'), alwaysDeleteInPrefixes=[NORMALISED_DOC_NAME]), (oaiSuspendRegister, )))
    normLogger = Logger(join(statePath, 'normlogger'))

    # strategie = HashDistributeStrategy() # filename (=partname) is also hashed: difficult to read by human eye...
    strategie = Md5HashDistributeStrategy()

    storeComponent = StorageComponent(join(statePath, 'store'), strategy=strategie, partsRemovedOnDelete=[NORMALISED_DOC_NAME])

    return \
    (Observable(),
        # (scheduledCommitPeriodicCall,),
        # (DebugPrompt(reactor=reactor, port=port+1, globals=locals()),),
        (ObservableHttpServer(reactor=reactor, port=port),
            (BasicHttpHandler(),
                (IpFilter(allowedIps=['127.0.0.1']),
                    (PathFilter('/oaix', excluding=['/oaix/info']),
                        (OaiPmh(repositoryName='Gateway', adminEmail='*****@*****.**', supportXWait=True, batchSize=2000 # Override default batch size of 200.
                            ),
                            (oaiJazz,),
                            (oaiSuspendRegister,),
                            (StorageAdapter(),
                                (storeComponent,),
                            ),
                        )
                    ),
                    (PathFilter('/oaix/info'),
                        (OaiInfo(reactor=reactor, oaiPath='/oai'),
                            (oaiJazz,),
                        )
                    ),
                ),
                (PathFilter('/update'),
                    (SruRecordUpdate(sendRecordData=False, logErrors=True,),
                        (FilterMessages(allowed=['delete']),
                            (storeComponent,),
                            (oaiJazz,),
                        ),
                        (FilterMessages(allowed=['add']),
                            # (LogComponent("LXML:"),),
                            (Validate([('DIDL container','//didl:DIDL', 'didl.xsd'), ('MODS metadata', '//mods:mods', 'mods-3-6.xsd')]),
                                # (LogComponent("VALIDATED:"),),
                                (AddMetadataDocumentPart(partName='normdoc', fromKwarg='lxmlNode'),
                                    (NormaliseDIDL(nsMap=namespacesMap, fromKwarg='lxmlNode'), # Normalise DIDL in partname=normdoc metadata
                                        (normLogger,),
                                        (NormaliseMODS(nsMap=namespacesMap, fromKwarg='lxmlNode'), # Normalise MODS in partname=normdoc metadata
                                            (normLogger,),
                                            (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data'),
                                                (RewritePartname(NORMALISED_DOC_NAME), # Rename converted part.
                                                    (storeComponent,), # Store converted/renamed part.
                                                )
                                            ),
                                            # Sibling of the storage chain: register the record with oaiJazz.
                                            (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[NORMALISED_DOC_NAME]),
                                                (oaiJazz,),
                                            )
                                        )
                                    )
                                )
                            )
                        )
                    )
                )
            )
        )
    )
def main(reactor, port, statePath, **ignored):
    """Return the DNA tuple of the gateway server (OAI record normalisation).

    Two top-level branches hang below BasicHttpHandler:
    - an IpFilter (allowedIps=['127.0.0.1']) containing '/oaix' (OaiPmh) and
      '/oaix/info' (OaiInfo);
    - '/update' (SruRecordUpdate) with one FilterMessages branch for 'delete'
      and one for 'add'. The 'add' branch selects 'srw:recordData/*' nodes,
      normalises and stores them and registers the record with oaiJazz.

    reactor   -- passed to ObservableHttpServer and OaiInfo.
    port      -- port for ObservableHttpServer.
    statePath -- base directory; 'oai' and 'store' are joined onto it.
    ignored   -- further keyword arguments; not used.
    """
    oaiSuspendRegister = SuspendRegister()
    # oaiJazz is itself a small tree: OaiJazz observed by the suspend register.
    oaiJazz = be((OaiJazz(join(statePath, 'oai')), (oaiSuspendRegister, )))

    # WST:
    # strategie = HashDistributeStrategy() # filename (=partname) is also hashed: difficult to read by human eye...
    strategie = Md5HashDistributeStrategy()

    storeComponent = StorageComponent(join(statePath, 'store'), strategy=strategie, partsRemovedOnDelete=[NORMALISED_DOC_NAME])

    return \
    (Observable(),
        # (scheduledCommitPeriodicCall,),
        # (DebugPrompt(reactor=reactor, port=port+1, globals=locals()),),
        (ObservableHttpServer(reactor=reactor, port=port),
            (BasicHttpHandler(),
                (IpFilter(allowedIps=['127.0.0.1']),
                    (PathFilter('/oaix', excluding=['/oaix/info']),
                        (OaiPmh(repositoryName='Gateway', adminEmail='*****@*****.**', supportXWait=True, batchSize=2000 # Override default batch size of 200.
                            ),
                            (oaiJazz,),
                            (oaiSuspendRegister,),
                            (StorageAdapter(),
                                (storeComponent,),
                            ),
                        )
                    ),
                    (PathFilter('/oaix/info'),
                        (OaiInfo(reactor=reactor, oaiPath='/oai'),
                            (oaiJazz,),
                        )
                    ),
                ),
                (PathFilter('/update'),
                    (SruRecordUpdate(sendRecordData=False, logErrors=True,),
                        (FilterMessages(allowed=['delete']),
                            (storeComponent,),
                            (oaiJazz,),
                        ),
                        (FilterMessages(allowed=['add']),
                            # Does not work? See comments in component...
                            # (AddMetadataFormat(fromKwarg="lxmlNode", name='md_format'),
                            #     (LogComponent("AddMetadataFormat"),),
                            # ),
                            (XmlXPath(['srw:recordData/*'], fromKwarg='lxmlNode'), # Forwards EVERY matching node onward in a new message.
                                # (LogComponent("TO LONG CONVERTER:"),),
                                (AddMetadataNamespace(dateformat="%Y-%m-%dT%H:%M:%SZ", fromKwarg='lxmlNode'), # Adds metadataNamespace to meta part in the message.
                                    (NormaliseOaiRecord(fromKwarg='lxmlNode'), # Normalises record to: long & original parts. Raises ValidationException if no 'known' metadataformat
                                        (XmlPrintLxml(fromKwarg='lxmlNode', toKwarg='data', pretty_print=False),
                                            (RewritePartname(NORMALISED_DOC_NAME), # Rename converted part.
                                                (storeComponent,), # Store converted/renamed part.
                                            )
                                        )
                                    ),
                                    # Sibling of the normalisation chain: register the record with oaiJazz.
                                    (OaiAddDeleteRecordWithPrefixesAndSetSpecs(metadataPrefixes=[NORMALISED_DOC_NAME]),
                                        (oaiJazz,),
                                    )
                                )
                            )
                        )
                    )
                )
            )
        )
    )
def main(reactor, port, statePath, lucenePort, **ignored):
    """Return the DNA tuple of a read-only SRU and RSS server on top of Lucene.

    A read-only Lucene query helix (luceneRoHelix) is built first and wrapped
    in executeQueryHelix, which is then shared by the '/sru' and the '/rss'
    branch of the http server. Records are read from a StorageComponent
    located in 'store' below statePath.

    reactor    -- passed to SocketPool and ObservableHttpServer.
    port       -- port for ObservableHttpServer.
    statePath  -- base directory of the storage component.
    lucenePort -- port passed to luceneAndReaderConfig and MultiLucene.
    ignored    -- further keyword arguments; not used.
    """

    ######## START Lucene Integration ###############################################################
    defaultLuceneSettings = LuceneSettings(
        commitTimeout=30,
        readonly=True,)

    http11Request = be(
        (HttpRequest1_1(),
            (SocketPool(reactor=reactor, unusedTimeout=5, limits=dict(totalSize=100, destinationSize=10)),),
        )
    )

    luceneIndex = luceneAndReaderConfig(defaultLuceneSettings.clone(readonly=True), http11Request, lucenePort)

    luceneRoHelix = be(
        (AdapterToLuceneQuery(
                defaultCore=DEFAULT_CORE,
                coreConverters={
                    DEFAULT_CORE: QueryExpressionToLuceneQueryDict(UNQUALIFIED_TERM_FIELDS, luceneSettings=luceneIndex.settings),
                }
            ),
            (MultiLucene(host='127.0.0.1', port=lucenePort, defaultCore=DEFAULT_CORE),
                (luceneIndex,),
                (http11Request,),
            )
        )
    )
    ######## END Lucene Integration ###############################################################

    # No rewrites are registered here, so fieldnameRewrite returns every name unchanged.
    fieldnameRewrites = {}

    def fieldnameRewrite(name):
        # Return the registered rewrite for name, or name itself when there is none.
        return fieldnameRewrites.get(name, name)

    def drilldownFieldnamesTranslate(fieldname):
        # Switch to the untokenized variant when it is a known untokenized
        # field, then apply fieldnameRewrite.
        untokenizedName = untokenizedFieldname(fieldname)
        if untokenizedName in untokenizedFieldnames:
            fieldname = untokenizedName
        return fieldnameRewrite(fieldname)

    convertToComposedQuery = ConvertToComposedQuery(
        resultsFrom=DEFAULT_CORE,
        matches=[],
        drilldownFieldnamesTranslate=drilldownFieldnamesTranslate
    )

    strategie = Md5HashDistributeStrategy()
    storage = StorageComponent(join(statePath, 'store'), strategy=strategie, partsRemovedOnDelete=[HEADER_PARTNAME, META_PARTNAME, METADATA_PARTNAME, OAI_DC_PARTNAME, LONG_PARTNAME, SHORT_PARTNAME])

    # What does this do? (open question left by the original author)
    cqlClauseConverters = [
        RenameFieldForExact(
            untokenizedFields=untokenizedFieldnames,
            untokenizedPrefix=UNTOKENIZED_PREFIX,
        ).filterAndModifier(),
        SearchTermFilterAndModifier(
            shouldModifyFieldValue=lambda *args: True,
            fieldnameModifier=fieldnameRewrite
        ).filterAndModifier(),
    ]

    # Shared sub-tree: only 'executeQuery' messages pass, ending at luceneRoHelix.
    executeQueryHelix = \
        (FilterMessages(allowed=['executeQuery']),
            (CqlMultiSearchClauseConversion(cqlClauseConverters, fromKwarg='query'),
                (DrilldownQueries(),
                    (convertToComposedQuery,
                        (luceneRoHelix,),
                    )
                )
            )
        )

    return \
    (Observable(),
        (ObservableHttpServer(reactor, port, compressResponse=True),
            (BasicHttpHandler(),
                (PathFilter(['/sru']),
                    (SruParser(
                            host='sru.narcis.nl',
                            port=80,
                            defaultRecordSchema='knaw_short',
                            defaultRecordPacking='xml'),
                        (SruLimitStartRecord(limitBeyond=4000),
                            (SruHandler(
                                    includeQueryTimes=False,
                                    extraXParameters=[],
                                    enableCollectLog=False), #2017-03-24T12:00:33Z 127.0.0.1 3.5K 0.019s - /sru OF (TRUE): 2017-03-24T11:58:53Z 127.0.0.1 2.3K 0.004s 1hits /sru maximumRecords=10&operation=searchRetrieve&query=untokenized.dd_year+exact+%221993%22&recordPacking=xml&recordSchema=knaw_short&startRecord=1&version=1.2
                                (SruTermDrilldown(),),
                                executeQueryHelix,
                                (StorageAdapter(),
                                    (storage,)
                                )
                            )
                        )
                    )
                ),
                (PathFilter('/rss'),
                    (Rss(
                            supportedLanguages = ['nl','en'], # defaults to first, if requested language is not available or supplied.
                            title = {'nl':'NARCIS', 'en':'NARCIS'},
                            description = {'nl':'NARCIS: De toegang tot de Nederlandse wetenschapsinformatie', 'en':'NARCIS: The gateway to Dutch scientific information'},
                            link = {'nl':'http://www.narcis.nl/?Language=nl', 'en':'http://www.narcis.nl/?Language=en'},
                            maximumRecords = 20),
                        executeQueryHelix,
                        (RssItem(
                                nsMap=NAMESPACEMAP,
                                title = ('knaw_short', {'nl':'//short:metadata/short:titleInfo[not (@xml:lang)]/short:title/text()', 'en':'//short:metadata/short:titleInfo[@xml:lang="en"]/short:title/text()'}),
                                description = ('knaw_short', {'nl':'//short:abstract[not (@xml:lang)]/text()', 'en':'//short:abstract[@xml:lang="en"]/text()'}),
                                pubdate = ('knaw_short', '//short:dateIssued/short:parsed/text()'),
                                linkTemplate = 'http://www.narcis.nl/%(wcpcollection)s/RecordID/%(oai_identifier)s/Language/%(language)s',
                                wcpcollection = ('meta', '//*[local-name() = "collection"]/text()'),
                                oai_identifier = ('meta', '//meta:record/meta:id/text()'),
                                language = ('Dummy: Language is auto provided by the calling RSS component, but needs to be present to serve the linkTemplate.')
                            ),
                            (StorageAdapter(),
                                (storage,)
                            )
                        )
                    )
                )
            )
        )
    )
def testServerWithPrio(self):
    # observer_init must register the server with the reactor through a
    # single addReader call that carries the configured prio.
    reactor = CallTrace('reactor')
    s = ObservableHttpServer(reactor, 2000, prio=3)
    s.observer_init()
    # assertEqual replaces the deprecated assertEquals alias.
    self.assertEqual(['addReader'], reactor.calledMethodNames())
    self.assertEqual(3, reactor.calledMethods[0].kwargs['prio'])
def dna(reactor, port, dataPath, logPath, statePath, externalUrl, customerLogoUrl, deproxyIps=None, **ignored):
    """Return the DNA tuple of the harvester web portal.

    Besides building the tuple this function has side effects: it prints the
    configuration, creates the 'www-data'/'var' directory below statePath
    when it is missing, and initialises user/group management in 'users'
    below statePath.

    reactor         -- passed to ObservableHttpServer and DynamicHtml.
    port            -- port for ObservableHttpServer.
    dataPath        -- passed to createEnvironment; also part of the config dict.
    logPath         -- passed to RepositoryStatus; also part of the config dict.
    statePath       -- passed to RepositoryStatus; base for 'users' and 'www-data'/'var'.
    externalUrl     -- exposed to templates and stored in the config dict.
    customerLogoUrl -- exposed to templates.
    deproxyIps      -- passed to Deproxy as deproxyForIps.
    ignored         -- further keyword arguments; not used.
    """
    environment = createEnvironment(dataPath)
    harvesterData = environment.createHarvesterData()
    harvesterDataRetrieve = environment.createHarvesterDataRetrieve()
    deproxy = Deproxy(deproxyForIps=deproxyIps)
    repositoryStatus = be(
        (RepositoryStatus(logPath, statePath),
            (harvesterData, )
        )
    )
    # NOTE(review): the key below is spelled 'externaUrl' (without the final
    # 'l' of 'external') while the parameter is externalUrl -- confirm whether
    # consumers of this dict rely on that spelling before changing it.
    configDict = JsonDict(
        logPath=logPath,
        statePath=statePath,
        externaUrl=externalUrl,
        dataPath=dataPath,
    )
    print("Started Metastreams with configuration:\n" + configDict.pretty_print())

    userGroup = initializeUserGroupManagement(join(statePath, 'users'), harvesterData)
    # Shared sub-tree: used for '/login.action' and again below DynamicHtml.
    basicHtmlLoginHelix = (BasicHtmlLoginForm(
        action="/login.action",
        loginPath="/login",
        home="/index",
        rememberMeCookie=False,
        lang="nl"),
        (userGroup.basicHtmlObserver,),
    )
    varWwwdataPath = join(statePath, 'www-data', 'var')
    # Create the directory only when it does not exist yet.
    isdir(varWwwdataPath) or makedirs(varWwwdataPath)

    # One StaticFiles observer per (url path, directory) pair; the url paths
    # are collected so the dynamic branch can exclude them.
    staticFilePaths = []
    staticFiles = Transparent()
    for path, libdir in [
            ('/js/bootstrap', '/usr/share/javascript/bootstrap5/js'),
            ('/css/bootstrap', '/usr/share/javascript/bootstrap5/css'),
            ('/css/bootstrap-icons', '/usr/share/javascript/bootstrap-icons'),
            ('/js/jquery', '/usr/share/javascript/jquery'),
            ('/js/jquery-tablesorter', '/usr/share/javascript/jquery-tablesorter'),
            ('/css/jquery-tablesorter', '/usr/share/javascript/jquery-tablesorter/css'),
            ('/js/autosize', '/usr/share/javascript/autosize'),
            ('/static', staticHtmlPath),
            ('/var', varWwwdataPath),
            ]:
        staticFiles.addObserver(StaticFiles(libdir=libdir, path=path))
        staticFilePaths.append(path)

    return \
    (Observable(),
        (ObservableHttpServer(reactor, port),
            (LogCollector(),
                (ApacheLogWriter(stdout),),
                (deproxy,
                    (HandleRequestLog(),
                        (BasicHttpHandler(),
                            (SessionHandler(),
                                (CookieMemoryStore(name="meresco-harvester", timeout=2*60*60), ),
                                (UserFromSession(),
                                    (PathFilter("/info/version"),
                                        (StringServer(VERSION_STRING, ContentTypePlainText), )
                                    ),
                                    (PathFilter("/info/config"),
                                        (StringServer(configDict.dumps(), ContentTypeJson), )
                                    ),
                                    (PathFilter('/login.action'),
                                        basicHtmlLoginHelix
                                    ),
                                    (staticFiles,),
                                    # Dynamic pages: everything not claimed by one of the other branches.
                                    (PathFilter('/', excluding=['/info/version', '/info/config', '/action', '/login.action'] + harvesterDataRetrieve.paths + staticFilePaths),
                                        (SecureZone("/login", excluding=["/index", "/invalid", "/rss", '/running.rss', '/showHarvesterStatus'], defaultLanguage="nl"),
                                            (PathFilter('/', excluding=userGroup.excludedPaths),
                                                (DynamicHtml(
                                                        [dynamicHtmlPath],
                                                        reactor=reactor,
                                                        additionalGlobals={
                                                            'externalUrl': externalUrl,
                                                            'escapeXml': escapeXml,
                                                            'compose': compose,
                                                            'dumps': dumps,
                                                            'VERSION': VERSION,
                                                            'CONFIG': configDict,
                                                            'Timeslot': Timeslot,
                                                            'ThroughputAnalyser': ThroughputAnalyser,
                                                            'dateSince': dateSince,
                                                            'callable': callable,
                                                            'OnlineHarvest': OnlineHarvest,
                                                            'StringIO': StringIO,
                                                            'okPlainText': okPlainText,
                                                            'ZuluTime': ZuluTime,
                                                            'xpathFirst': xpathFirst,
                                                            'customerLogoUrl': customerLogoUrl,
                                                            'uuid': lambda: str(uuid4()),
                                                        },
                                                        indexPage="/index",
                                                    ),
                                                    basicHtmlLoginHelix,
                                                    (harvesterData,),
                                                    (repositoryStatus,),
                                                    (userGroup.dynamicHtmlObserver,),
                                                )
                                            ),
                                            (userGroup.actions,),
                                        ),
                                    ),
                                    (PathFilter('/action'),
                                        (HarvesterDataActions(),
                                            (harvesterData,)
                                        ),
                                    ),
                                    (PathFilter(harvesterDataRetrieve.paths),
                                        (harvesterDataRetrieve,
                                            (FilterFields(),
                                                (harvesterData,),
                                            ),
                                            (repositoryStatus,),
                                        )
                                    )
                                )
                            )
                        )
                    )
                )
            )
        )
    )
def main(reactor, port, statePath, gatewayPort, quickCommit=False, **ignored):
    """Return the DNA tuple of the GMH api server.

    The tree holds a download helix (built by createDownloadHelix from a
    PeriodicDownload, an OaiDownloadProcessor, the storage and oaiJazz) next
    to an ObservableHttpServer with three branches: '/oai' (OaiPmh),
    '/rss' (LoggerRSS) and '/xls' (XlsServer).

    Before the tuple is built, three metadata formats are registered on
    oaiJazz with updateMetadataFormat.

    reactor     -- passed to PeriodicDownload, createDownloadHelix and the http server.
    port        -- port for ObservableHttpServer.
    statePath   -- base directory for 'store', 'oai', the normalisation logger
                   ('..'/'gateway'/'normlogger') and 'harvesterstate'/'gateway'.
    gatewayPort -- port on 'localhost' used by PeriodicDownload.
    quickCommit -- selects a schedule period of .1 instead of 10.
    ignored     -- further keyword arguments; not used.
    """
    strategie = Md5HashDistributeStrategy()
    storage = StorageComponent(join(statePath, 'store'), strategy=strategie, partsRemovedOnDelete=[NL_DIDL_NORMALISED_PREFIX, NL_DIDL_COMBINED_PREFIX, 'metadata'])

    oaiJazz = OaiJazz(join(statePath, 'oai'))
    oaiJazz.updateMetadataFormat("metadata", "http://didl.loc.nl/didl.xsd", NAMESPACEMAP.didl)
    oaiJazz.updateMetadataFormat(NL_DIDL_COMBINED_PREFIX, "", NAMESPACEMAP.gmhcombined)
    oaiJazz.updateMetadataFormat(NL_DIDL_NORMALISED_PREFIX, "", NAMESPACEMAP.gmhnorm)

    # The logger directory lives in the gateway's state directory, a sibling of statePath.
    normLogger = Logger(join(statePath, '..', 'gateway', 'normlogger'))

    periodicGateWayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(period=.1 if quickCommit else 10), # WST: Interval in seconds before sending a new request to the GATEWAY in case of an error while processing batch records.(default=1). IntegrationTests need <=1 second! Otherwise tests will fail!
        name='api',
        autoStart=True)

    oaiDownload = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=join(statePath, 'harvesterstate', 'gateway'),
        userAgentAddition='ApiServer',
        xWait=True,
        name='api',
        autoCommit=False)

    return \
    (Observable(),
        createDownloadHelix(reactor, periodicGateWayDownload, oaiDownload, storage, oaiJazz),
        (ObservableHttpServer(reactor, port, compressResponse=True),
            (BasicHttpHandler(),
                (PathFilter('/oai'),
                    (OaiPmh(
                            repositoryName="Gemeenschappelijke Metadata Harvester DANS-KB",
                            adminEmail="*****@*****.**",
                            externalUrl="http://oai.gharvester.dans.knaw.nl",
                            batchSize=200,
                            supportXWait=False,
                            # preciseDatestamp=False,
                            # deleteInSets=False
                        ),
                        (oaiJazz, ),
                        (RetrieveToGetDataAdapter(),
                            (storage,),
                        ),
                        (OaiBranding(
                            url="https://www.narcis.nl/images/logos/logo-knaw-house.gif", #TODO: Link to a joint-GMH icon...
                            link="https://harvester.dans.knaw.nl",
                            title="Gemeenschappelijke Metadata Harvester (GMH) van DANS en de KB"),
                        ),
                        (OaiProvenance(
                                nsMap=NAMESPACEMAP,
                                baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                                harvestDate=('meta', '//meta:harvestdate/text()'),
                                metadataNamespace=('meta', '//meta:metadataPrefix/text()'), #TODO: Could possibly be hard-coded in the harvester mapper: <metadataNamespace>urn:mpeg:mpeg21:2002:01-DII-NS</metadataNamespace>?? (storage,)
                                #metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                                identifier=('header','//oai:identifier/text()'),
                                datestamp=('header', '//oai:datestamp/text()')
                            ),
                            (RetrieveToGetDataAdapter(),
                                (storage,),
                            )
                        )
                    )
                ),
                (PathFilter('/rss'),
                    (LoggerRSS(
                            title = 'GMH DANS-KB Normalisationlog Syndication',
                            description = 'Harvester normalisation log for: ',
                            link = 'http://rss.gharvester.dans.knaw.nl/rss',
                            maximumRecords = 30),
                        (normLogger,
                            (storage,)
                        )
                    )
                ),
                (PathFilter('/xls'),
                    # (LogComponent("XLS-Request:"),),
                    (XlsServer(),)
                )
            )
        )
    )
def main(reactor, port, statePath, lucenePort, gatewayPort, quickCommit=False, **ignored):
    """Return the DNA tuple of the NARCIS api server (OAI-PMH, CERIF, SRU and RSS).

    The tree holds a download helix (built by createDownloadHelix) next to an
    ObservableHttpServer with four branches: '/oai' (OaiPmh on oaiJazz),
    '/cerif' (OaiPmhDans on oai_oa_cerifJazz), '/sru' and '/rss'. The last
    two share executeQueryHelix, which ends at a read-only Lucene helix.

    Before the tuple is built, metadata formats are registered on both
    OaiJazz instances and nine OpenAIRE CRIS sets on oai_oa_cerifJazz.

    reactor     -- passed to SocketPool, PeriodicDownload, createDownloadHelix
                   and ObservableHttpServer.
    port        -- port for ObservableHttpServer.
    statePath   -- base directory for 'store', 'oai', 'oai_cerif' and
                   'harvesterstate'/'gateway'.
    lucenePort  -- port passed to luceneAndReaderConfig and MultiLucene.
    gatewayPort -- port on 'localhost' used by PeriodicDownload.
    quickCommit -- selects a schedule period of 1 instead of 10.
    ignored     -- further keyword arguments; not used.
    """

    ######## START Lucene Integration ###############################################################
    defaultLuceneSettings = LuceneSettings(
        commitTimeout=30,
        readonly=True,
    )

    http11Request = be((
        HttpRequest1_1(),
        (SocketPool(reactor=reactor, unusedTimeout=5, limits=dict(totalSize=100, destinationSize=10)), ),
    ))

    luceneIndex = luceneAndReaderConfig(
        defaultLuceneSettings.clone(readonly=True), http11Request, lucenePort)

    luceneRoHelix = be(
        (AdapterToLuceneQuery(defaultCore=DEFAULT_CORE,
                coreConverters={
                    DEFAULT_CORE: QueryExpressionToLuceneQueryDict(
                        UNQUALIFIED_TERM_FIELDS, luceneSettings=luceneIndex.settings),
                }),
            (
                MultiLucene(host='localhost', port=lucenePort, defaultCore=DEFAULT_CORE),
                (luceneIndex, ),
                (http11Request, ),
            )))
    ######## END Lucene Integration ###############################################################

    # No rewrites are active: the only entry is commented out.
    fieldnameRewrites = {
        # UNTOKENIZED_PREFIX+'genre': UNTOKENIZED_PREFIX+'dc:genre',
    }

    def fieldnameRewrite(name):
        # Return the registered rewrite for name, or name itself when there is none.
        return fieldnameRewrites.get(name, name)

    def drilldownFieldnamesTranslate(fieldname):
        # Switch to the untokenized variant when it is a known untokenized
        # field, then apply fieldnameRewrite.
        untokenizedName = untokenizedFieldname(fieldname)
        if untokenizedName in untokenizedFieldnames:
            fieldname = untokenizedName
        return fieldnameRewrite(fieldname)

    convertToComposedQuery = ConvertToComposedQuery(
        resultsFrom=DEFAULT_CORE,
        matches=[],
        drilldownFieldnamesTranslate=drilldownFieldnamesTranslate)

    strategie = Md5HashDistributeStrategy()
    storage = StorageComponent(join(statePath, 'store'),
        strategy=strategie,
        partsRemovedOnDelete=[
            HEADER_PARTNAME, META_PARTNAME, METADATA_PARTNAME,
            OAI_DC_PARTNAME, LONG_PARTNAME, SHORT_PARTNAME,
            OPENAIRE_PARTNAME
        ])

    oaiJazz = OaiJazz(join(statePath, 'oai'))
    oaiJazz.updateMetadataFormat(
        OAI_DC_PARTNAME,
        "http://www.openarchives.org/OAI/2.0/oai_dc.xsd",
        "http://purl.org/dc/elements/1.1/")

    oai_oa_cerifJazz = OaiJazz(join(statePath, 'oai_cerif'))
    oai_oa_cerifJazz.updateMetadataFormat(
        OPENAIRE_PARTNAME,
        "https://www.openaire.eu/schema/cris/current/openaire-cerif-profile.xsd",
        "https://www.openaire.eu/cerif-profile/1.1/")
    # All of the following OAI-PMH sets shall be recognized by the CRIS, even if not all of them are populated.
    oai_oa_cerifJazz.updateSet("openaire_cris_projects", "OpenAIRE_CRIS_projects")
    oai_oa_cerifJazz.updateSet("openaire_cris_orgunits", "OpenAIRE_CRIS_orgunits")
    oai_oa_cerifJazz.updateSet("openaire_cris_persons", "OpenAIRE_CRIS_persons")
    oai_oa_cerifJazz.updateSet("openaire_cris_patents", "OpenAIRE_CRIS_patents")
    oai_oa_cerifJazz.updateSet("openaire_cris_products", "OpenAIRE_CRIS_products")
    oai_oa_cerifJazz.updateSet("openaire_cris_publications", "OpenAIRE_CRIS_publications")
    oai_oa_cerifJazz.updateSet("openaire_cris_funding", "OpenAIRE_CRIS_funding")
    oai_oa_cerifJazz.updateSet("openaire_cris_events", "OpenAIRE_CRIS_events")
    oai_oa_cerifJazz.updateSet("openaire_cris_equipments", "OpenAIRE_CRIS_equipments")

    cqlClauseConverters = [
        RenameFieldForExact(
            untokenizedFields=untokenizedFieldnames,
            untokenizedPrefix=UNTOKENIZED_PREFIX,
        ).filterAndModifier(),
        SearchTermFilterAndModifier(
            shouldModifyFieldValue=lambda *args: True,
            fieldnameModifier=fieldnameRewrite).filterAndModifier(),
    ]

    periodicGateWayDownload = PeriodicDownload(
        reactor,
        host='localhost',
        port=gatewayPort,
        schedule=Schedule(period=1 if quickCommit else 10), # WST: Interval in seconds before sending a new request to the GATEWAY in case of an error while processing batch records.(default=1). IntegrationTests need 1 second! Otherwise tests will fail!
        name='api',
        autoStart=True)

    oaiDownload = OaiDownloadProcessor(
        path='/oaix',
        metadataPrefix=NORMALISED_DOC_NAME,
        workingDirectory=join(statePath, 'harvesterstate', 'gateway'),
        userAgentAddition='ApiServer',
        xWait=True,
        name='api',
        autoCommit=False)

    # Shared sub-tree: only 'executeQuery' messages pass, ending at luceneRoHelix.
    executeQueryHelix = \
        (FilterMessages(allowed=['executeQuery']),
            (CqlMultiSearchClauseConversion(cqlClauseConverters, fromKwarg='query'),
                (DrilldownQueries(),
                    (convertToComposedQuery,
                        (luceneRoHelix,),
                    )
                )
            )
        )

    return \
    (Observable(),
        createDownloadHelix(reactor, periodicGateWayDownload, oaiDownload, storage, oaiJazz, oai_oa_cerifJazz),
        (ObservableHttpServer(reactor, port, compressResponse=True),
            (BasicHttpHandler(),
                (PathFilter(["/oai"]),
                    (OaiPmh(repositoryName="NARCIS OAI-pmh", adminEmail="*****@*****.**", externalUrl="http://oai.narcis.nl"),
                        (oaiJazz,),
                        (StorageAdapter(),
                            (storage,)
                        ),
                        (OaiBranding(
                            url="http://www.narcis.nl/images/logos/logo-knaw-house.gif",
                            link="http://oai.narcis.nl",
                            title="Narcis - The gateway to scholarly information in The Netherlands"),
                        ),
                        (OaiProvenance(
                                nsMap=NAMESPACEMAP,
                                baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                                harvestDate=('meta', '//meta:record/meta:harvestdate/text()'),
                                metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                                identifier=('header','//oai:identifier/text()'),
                                datestamp=('header', '//oai:datestamp/text()')
                            ),
                            (storage,)
                        )
                    )
                ),
                (PathFilter(["/cerif"]),
                    (OaiPmhDans(repositoryName="OpenAIRE CERIF", adminEmail="*****@*****.**", repositoryIdentifier="services.nod.dans.knaw.nl", externalUrl="http://services.nod.dans.knaw.nl"), #TODO: pathFilter should resemble proxy path
                        (oai_oa_cerifJazz,),
                        (StorageAdapter(),
                            (storage,)
                        ),
                        (OaiOpenAIREDescription(
                            serviceid='organisation:ORG1242054',
                            acronym='services.nod.dans.knaw.nl',
                            name='NARCIS',
                            description='Compliant with the OpenAIRE Guidelines for CRIS Managers v.1.1.',
                            website='https://www.narcis.nl',
                            baseurl='http://services.nod.dans.knaw.nl/oa-cerif',
                            subjectheading='',
                            orgunitid='organisation:ORG1242054',
                            owneracronym='DANS'),
                        ),
                        # (OaiBranding(
                        #     url="http://www.narcis.nl/images/logos/logo-knaw-house.gif",
                        #     link="http://oai.narcis.nl",
                        #     title="Narcis - The gateway to scholarly information in The Netherlands"),
                        # ),
                        (OaiProvenance(
                                nsMap=NAMESPACEMAP,
                                baseURL=('meta', '//meta:repository/meta:baseurl/text()'),
                                harvestDate=('meta', '//meta:record/meta:harvestdate/text()'),
                                metadataNamespace=('meta', '//meta:record/meta:metadataNamespace/text()'),
                                identifier=('header','//oai:identifier/text()'),
                                datestamp=('header', '//oai:datestamp/text()')
                            ),
                            (storage,)
                        )
                    )
                ),
                (PathFilter(['/sru']),
                    (SruParser(
                            host='sru.narcis.nl',
                            port=80,
                            defaultRecordSchema='knaw_short',
                            defaultRecordPacking='xml'),
                        (SruLimitStartRecord(limitBeyond=4000),
                            (SruHandler(
                                    includeQueryTimes=False,
                                    extraXParameters=[],
                                    enableCollectLog=False),
                                (SruTermDrilldown(),),
                                executeQueryHelix,
                                (StorageAdapter(),
                                    (storage,)
                                )
                            )
                        )
                    )
                ),
                (PathFilter('/rss'),
                    (Rss(
                            supportedLanguages = ['nl','en'], # defaults to first, if requested language is not available or supplied.
                            title = {'nl':'NARCIS', 'en':'NARCIS'},
                            description = {'nl':'NARCIS: De toegang tot de Nederlandse wetenschapsinformatie', 'en':'NARCIS: The gateway to Dutch scientific information'},
                            link = {'nl':'http://www.narcis.nl/?Language=nl', 'en':'http://www.narcis.nl/?Language=en'},
                            maximumRecords = 20),
                        executeQueryHelix,
                        (RssItem(
                                nsMap=NAMESPACEMAP,
                                title = ('knaw_short', {'nl':'//short:metadata/short:titleInfo[not (@xml:lang)]/short:title/text()', 'en':'//short:metadata/short:titleInfo[@xml:lang="en"]/short:title/text()'}),
                                description = ('knaw_short', {'nl':'//short:abstract[not (@xml:lang)]/text()', 'en':'//short:abstract[@xml:lang="en"]/text()'}),
                                pubdate = ('knaw_short', '//short:dateIssued/short:parsed/text()'),
                                linkTemplate = 'http://www.narcis.nl/%(wcpcollection)s/RecordID/%(oai_identifier)s/Language/%(language)s',
                                wcpcollection = ('meta', '//*[local-name() = "collection"]/text()'),
                                oai_identifier = ('meta', '//meta:record/meta:id/text()'),
                                language = ('Dummy: Language is auto provided by the calling RSS component, but needs to be present to serve the linkTemplate.')
                            ),
                            (StorageAdapter(),
                                (storage,)
                            )
                        )
                    )
                )
            )
        )
    )
def dna(reactor, port, **kwargs):
    """Return the observer tree for a plain HTTP echo server.

    The tree is a nested tuple: an ``Observable`` root observing an
    ``ObservableHttpServer`` (bound to ``0.0.0.0`` on *port*), which feeds a
    ``BasicHttpHandler`` that in turn delegates to ``RequestEcho``.

    Any extra keyword arguments are accepted but not used.
    """
    # Components are created in the same order in which the nested tuple
    # literal would evaluate them: root first, leaf last.
    root = Observable()
    httpServer = ObservableHttpServer(reactor, bindAddress='0.0.0.0', port=port)
    httpHandler = BasicHttpHandler()
    echo = RequestEcho()

    return (root,
        (httpServer,
            (httpHandler,
                (echo,),
            ),
        ),
    )
def createServer(reactor, port, solrPort):
    """Build the server tree that forwards to a local Solr instance.

    An ``ObservableHttpServer`` on *port* hands requests to
    ``_HelperHandler``, which observes a ``SolrInterface`` pointing at
    ``localhost:<solrPort>`` and the ``records`` core.

    Returns whatever ``be`` returns for the nested component tuple.
    """
    # Instantiate top-down so construction order equals that of the
    # equivalent inline tuple literal.
    root = Observable()
    httpServer = ObservableHttpServer(reactor, port)
    helper = _HelperHandler()
    solr = SolrInterface(host='localhost', port=solrPort, core='records')

    tree = (root,
        (httpServer,
            (helper,
                (solr,),
            ),
        ),
    )
    return be(tree)
def main(reactor, port, databasePath):
    """Assemble the observer tree for the 'Meresco Lucene' HTTP server.

    Three Lucene cores are created below *databasePath* (``main``, ``main2``
    and ``empty-core``), together with a term numerator and a storage
    component. The returned nested tuple wires an ``ObservableHttpServer``
    on *port* to path-filtered sub-trees for:

    * ``/info`` (dynamic pages), ``/info/version`` and ``/info/name``
    * ``/static`` (files served from ``staticPath``)
    * ``/update_main`` and ``/update_main2`` (one upload helix per core)
    * ``/sru`` (local search) and ``/via-remote-sru`` (search via ``/remote``)
    * ``/remote`` (remote Lucene service) and ``/autocomplete``

    Returns the nested tuple itself; nothing is started here.
    """
    facetFields = [
        DrilldownField('untokenized.field2'),
        DrilldownField('untokenized.fieldHier', hierarchical=True),
    ]
    registry = FieldRegistry(facetFields)

    # Core 'main'.
    mainSettings = LuceneSettings(
        fieldRegistry=registry,
        commitCount=30,
        commitTimeout=1,
        analyzer=MerescoDutchStemmingAnalyzer(),
    )
    mainCore = Lucene(
        path=join(databasePath, 'lucene'),
        reactor=reactor,
        name='main',
        settings=mainSettings,
    )

    # Core 'main2' shares the field registry but commits faster.
    secondSettings = LuceneSettings(fieldRegistry=registry, commitTimeout=0.1)
    secondCore = Lucene(
        path=join(databasePath, 'lucene2'),
        reactor=reactor,
        name='main2',
        settings=secondSettings,
    )

    numerator = TermNumerator(path=join(databasePath, 'termNumerator'))

    # Core 'empty-core' has its own settings without a shared field registry.
    emptySettings = LuceneSettings(commitTimeout=1)
    multiLucene = MultiLucene(defaultCore='main')
    emptyCore = Lucene(
        path=join(databasePath, 'lucene-empty'),
        reactor=reactor,
        name='empty-core',
        settings=emptySettings,
    )
    # One shared helix: the same MultiLucene instance serves /sru and /remote.
    coresHelix = (multiLucene,
        (emptyCore,),
        (mainCore,),
        (secondCore,),
    )

    storage = StorageComponent(directory=join(databasePath, 'storage'))

    def newCqlToLuceneHelix():
        # Builds a fresh query converter on every call (each route gets its
        # own instances), on top of the shared multi-core helix.
        converter = MultiCqlToLuceneQuery(
            defaultCore='main',
            coreToCqlLuceneQueries={
                "main": CqlToLuceneQuery([], luceneSettings=mainSettings),
                "main2": CqlToLuceneQuery([], luceneSettings=secondSettings),
                "empty-core": CqlToLuceneQuery([], luceneSettings=emptySettings),
            })
        return (converter, coresHelix)

    def newRemoteHelix():
        # Search backend that goes through this server's own /remote path.
        return (LuceneRemote(host='localhost', port=port, path='/remote'),)

    def newSruHelix(newSearchHelix):
        # The search backend is passed as a factory so that it is created
        # after the parser and handler, exactly where it sits in the tree.
        parser = SruParser(defaultRecordSchema='record')
        handler = SruHandler()
        return (parser,
            (handler,
                newSearchHelix(),
                (SRUTermDrilldown(defaultFormat='xml'),),
                (SruDuplicateCount(),),
                (storage,),
            ),
        )

    return (Observable(),
        (ObservableHttpServer(reactor=reactor, port=port),
            (BasicHttpHandler(),
                (ApacheLogger(outputStream=stdout),
                    (PathFilter("/info", excluding=[
                            '/info/version',
                            '/info/name',
                            '/update',
                            '/sru',
                            '/remote',
                            '/via-remote-sru',
                        ]),
                        (DynamicHtml(
                            [dynamicPath],
                            reactor=reactor,
                            indexPage='/info',
                            additionalGlobals={'VERSION': version},
                        ),),
                    ),
                    (PathFilter("/info/version"),
                        (StringServer(version, ContentTypePlainText),),
                    ),
                    (PathFilter("/info/name"),
                        (StringServer('Meresco Lucene', ContentTypePlainText),),
                    ),
                    (PathFilter("/static"),
                        # Strip the '/static' prefix before looking up the file.
                        (PathRename(lambda path: path[len('/static'):]),
                            (FileServer(staticPath),),
                        ),
                    ),
                    (PathFilter("/update_main", excluding=['/update_main2']),
                        uploadHelix(
                            mainCore,
                            numerator,
                            storage,
                            facetFields,
                            fieldRegistry=mainSettings.fieldRegistry,
                        ),
                    ),
                    (PathFilter("/update_main2"),
                        uploadHelix(
                            secondCore,
                            numerator,
                            storage,
                            facetFields,
                            fieldRegistry=secondSettings.fieldRegistry,
                        ),
                    ),
                    (PathFilter('/sru'),
                        newSruHelix(newCqlToLuceneHelix),
                    ),
                    (PathFilter('/via-remote-sru'),
                        newSruHelix(newRemoteHelix),
                    ),
                    (PathFilter('/remote'),
                        (LuceneRemoteService(reactor=reactor),
                            newCqlToLuceneHelix(),
                        ),
                    ),
                    (PathFilter('/autocomplete'),
                        (Autocomplete('localhost', port, '/autocomplete', '__all__', '?', 5, '?', '?'),
                            (mainCore,),
                        ),
                    ),
                ),
            ),
        ),
    )