def test_resumption_token(self):
    """oairepository - testing harvesting with bad resumption token

    Sends a ListRecords request whose resumptionToken is 'foobar' and
    expects the text 'badResumptionToken' to appear in what was written.
    """
    response = StringIO()
    # Non existing resumptionToken
    request_args = {'resumptionToken': 'foobar', 'verb': 'ListRecords'}
    oai_repository_server.oai_list_records_or_identifiers(response, request_args)
    error_reported = 'badResumptionToken' in response.getvalue()
    self.assert_(error_reported)
def test_resumption_token(self):
    """oairepository - testing harvesting with bad resumption token

    A ListRecords request is issued with the resumptionToken "foobar";
    the written output must contain "badResumptionToken".
    """
    output = StringIO()
    # Non existing resumptionToken
    bad_token_request = {"resumptionToken": "foobar", "verb": "ListRecords"}
    oai_repository_server.oai_list_records_or_identifiers(output, bad_token_request)
    written = output.getvalue()
    self.assert_("badResumptionToken" in written)
def test_hidden_fields(self):
    """oairepository - not exposing hidden fields

    Requests identifier oai:atlantis.cern.ch:12 with metadataPrefix
    marcxml and verb GetRecord, then checks that the output contains
    neither an "<error" fragment nor a 595 datafield opening tag.
    """
    output = StringIO()
    query = {
        'verb': 'GetRecord',
        'metadataPrefix': 'marcxml',
        'identifier': 'oai:atlantis.cern.ch:12',
    }
    oai_repository_server.oai_list_records_or_identifiers(output, query)
    result = output.getvalue()

    has_error = "<error" in result
    self.failIf(has_error, "Errors found in result: %s" % result)

    hidden_tag = '<marc:datafield tag="595" ind1=" " ind2=" " >'
    self.failIf(hidden_tag in result,
                "Hidden field 595 found in result: %s" % result)
def test_resumption_token(self):
    """oairepository - testing harvesting with bad resumption token

    Harvests with verb ListRecords and resumptionToken 'foobar' and
    asserts that 'badResumptionToken' shows up in the written response.
    """
    sink = StringIO()
    # Non existing resumptionToken
    arguments = {
        'resumptionToken': 'foobar',
        'verb': 'ListRecords',
    }
    oai_repository_server.oai_list_records_or_identifiers(sink, arguments)
    body = sink.getvalue()
    self.assert_('badResumptionToken' in body)
def test_response_speed_marcxml(self):
    """oairepository - speed of response for marcxml output

    Times a single ListRecords request with metadataPrefix=marcxml and
    fails when it took longer than 0.05 seconds multiplied by
    self.number_of_records.
    """
    allowed_seconds_per_record_marcxml = 0.05

    # Test marcxml ListRecords performance
    started = time.time()
    oai_repository_server.oai_list_records_or_identifiers(
        StringIO(),
        argd={'metadataPrefix': 'marcxml', 'verb': 'ListRecords'})
    elapsed = time.time() - started

    # Budget is read after the request, as the comparison needs it
    budget = self.number_of_records * allowed_seconds_per_record_marcxml
    if elapsed > budget:
        self.fail("""Response for ListRecords with metadataPrefix=marcxml took too much time:\n %s seconds. Limit: %s seconds""" % (elapsed, budget))
def test_response_speed_oai(self):
    """oairepository - speed of response for oai_dc output

    Times a single ListRecords request with metadataPrefix=oai_dc and
    fails when it took longer than 0.03 seconds multiplied by
    self.number_of_records.
    """
    allowed_seconds_per_record_oai = 0.03

    # Test oai ListRecords performance
    started = time.time()
    oai_repository_server.oai_list_records_or_identifiers(
        StringIO(), {'metadataPrefix': 'oai_dc', 'verb': 'ListRecords'})
    elapsed = time.time() - started

    # Budget is read after the request, as the comparison needs it
    budget = self.number_of_records * allowed_seconds_per_record_oai
    if elapsed > budget:
        self.fail("""Response for ListRecords with metadataPrefix=oai_dc took too much time: %s seconds. Limit: %s seconds""" % (elapsed, budget))
def test_verbs(self):
    """oairepository - testing verbs

    Calls the server entry point of each verb (Identify,
    ListIdentifiers, ListRecords, ListMetadataFormats, ListSets,
    GetRecord) and checks that the verb name can be found in the output.
    """
    server = oai_repository_server

    def expect_verb(verb, output):
        # re.search returns None when the verb name is absent
        self.assertNotEqual(None, re.search(verb, output))

    expect_verb("Identify", server.oai_identify({'verb': 'Identify'}))

    # These two verbs write into a file-like object instead of returning
    for verb in ('ListIdentifiers', 'ListRecords'):
        written = StringIO()
        server.oai_list_records_or_identifiers(
            written, {'verb': verb, 'metadataPrefix': 'marcxml'})
        expect_verb(verb, written.getvalue())

    expect_verb("ListMetadataFormats",
                server.oai_list_metadata_formats({'verb': 'ListMetadataFormats'}))
    expect_verb("ListSets", server.oai_list_sets({'verb': 'ListSets'}))
    expect_verb("GetRecord",
                server.oai_get_record({'identifier': 'oai:atlantis.cern.ch:1',
                                       'verb': 'GetRecord'}))
def test_hidden_fields(self):
    """oairepository - not exposing hidden fields

    Asks for identifier oai:atlantis.cern.ch:12 as marcxml (verb
    GetRecord) and verifies the written output holds no "<error"
    fragment and no opening tag of datafield 595.
    """
    sink = StringIO()
    arguments = {
        'verb': 'GetRecord',
        'metadataPrefix': 'marcxml',
        'identifier': 'oai:atlantis.cern.ch:12',
    }
    oai_repository_server.oai_list_records_or_identifiers(sink, arguments)
    result = sink.getvalue()

    error_message = "Errors found in result: %s" % result
    self.failIf("<error" in result, error_message)

    hidden_message = "Hidden field 595 found in result: %s" % result
    self.failIf('<marc:datafield tag="595" ind1=" " ind2=" " >' in result,
                hidden_message)
def test_from_and_until(self):
    """oairepository - testing selective harvesting with 'from' and 'until' parameters

    Harvests identifier/datestamp pairs with ListIdentifiers, then
    checks oai_get_recid_list() against the first pair:

    - the record is found when its own datestamp (full, or date-only)
      is given as 'from', as 'until', or as both;
    - the record is not found with 'from' one year later, nor with
      'until' one year earlier;
    - the range from the earliest to the latest harvested datestamp
      returns the same list as a call without any bound.
    """
    server = oai_repository_server
    req = StringIO()

    # List available records, get datestamps and play with them
    server.oai_list_records_or_identifiers(
        req, {'verb': 'ListIdentifiers', 'metadataPrefix': 'marcxml'})
    identifiers = req.getvalue()
    # Raw string: \s must reach the regex engine as written
    datestamps = re.findall(
        r'<identifier>(?P<id>.*?)</identifier>\s*<datestamp>(?P<date>.*?)</datestamp>',
        identifiers, re.M)

    # There must be some datestamps. Checked before indexing so that an
    # empty harvest is reported as a test failure, not as an IndexError.
    self.assertNotEqual([], datestamps)

    # Take one oai id together with its datestamp
    sample_oai_id, sample_datestamp = datestamps[0]
    # Find corresponding system number id
    sample_id = search_engine.perform_request_search(
        p=sample_oai_id, f=CFG_OAI_ID_FIELD)[0]

    # We must be able to retrieve an id with the date we have just found
    recids_from = server.oai_get_recid_list(fromdate=sample_datestamp)
    self.assert_(sample_id in recids_from,
                 "%s not in %s (fromdate=%s)" % (sample_id, recids_from, sample_datestamp))
    recids_until = server.oai_get_recid_list(untildate=sample_datestamp)
    self.assert_(sample_id in recids_until,
                 "%s not in %s" % (sample_id, recids_until))
    self.assert_(sample_id in server.oai_get_recid_list(untildate=sample_datestamp,
                                                        fromdate=sample_datestamp))

    # Same, with short format date. Eg 2007-12-13
    short_datestamp = sample_datestamp.split('T')[0]
    self.assert_(sample_id in server.oai_get_recid_list(fromdate=short_datestamp))
    self.assert_(sample_id in server.oai_get_recid_list(untildate=short_datestamp))
    self.assert_(sample_id in server.oai_get_recid_list(fromdate=short_datestamp,
                                                        untildate=short_datestamp))

    # At later date (year after) we should not find our id again
    sample_datestamp_year = int(sample_datestamp[0:4])
    sample_datestamp_rest = sample_datestamp[4:]
    later_datestamp = str(sample_datestamp_year + 1) + sample_datestamp_rest
    self.assert_(sample_id not in server.oai_get_recid_list(fromdate=later_datestamp))

    # At earlier date (year before) we should not find our id again
    earlier_datestamp = str(sample_datestamp_year - 1) + sample_datestamp_rest
    self.assert_(sample_id not in server.oai_get_recid_list(untildate=earlier_datestamp))

    # From earliest date to latest date must include all oai records.
    # Map each parsed time to its original datestamp text, then pick
    # the extremes directly instead of sorting the key list.
    dates = dict((time.mktime(time.strptime(stamp, "%Y-%m-%dT%H:%M:%SZ")), stamp)
                 for (_oai_id, stamp) in datestamps)
    earliest_datestamp = dates[min(dates)]
    latest_datestamp = dates[max(dates)]
    self.assertEqual(server.oai_get_recid_list(),
                     server.oai_get_recid_list(fromdate=earliest_datestamp,
                                               untildate=latest_datestamp))
def test_response_speed_oai(self):
    """oairepository - speed of response for oai_dc output

    Measures the wall-clock time of one ListRecords request with
    metadataPrefix=oai_dc; the test fails when that time exceeds 0.02
    seconds multiplied by self.number_of_records.
    """
    allowed_seconds_per_record_oai = 0.02

    # Test oai ListRecords performance
    request_args = {'metadataPrefix': 'oai_dc', 'verb': 'ListRecords'}
    t_start = time.time()
    oai_repository_server.oai_list_records_or_identifiers(StringIO(), request_args)
    duration = time.time() - t_start

    # Limit is read after the request, as the comparison needs it
    limit = self.number_of_records * allowed_seconds_per_record_oai
    if duration > limit:
        self.fail("""Response for ListRecords with metadataPrefix=oai_dc took too much time: %s seconds. Limit: %s seconds""" % (duration, limit))
def test_response_speed_marcxml(self):
    """oairepository - speed of response for marcxml output

    Measures the wall-clock time of one ListRecords request with
    metadataPrefix=marcxml; the test fails when that time exceeds 0.05
    seconds multiplied by self.number_of_records.
    """
    allowed_seconds_per_record_marcxml = 0.05

    # Test marcxml ListRecords performance
    request_args = {'metadataPrefix': 'marcxml', 'verb': 'ListRecords'}
    t_start = time.time()
    oai_repository_server.oai_list_records_or_identifiers(StringIO(), argd=request_args)
    duration = time.time() - t_start

    # Limit is read after the request, as the comparison needs it
    limit = self.number_of_records * allowed_seconds_per_record_marcxml
    if duration > limit:
        self.fail("""Response for ListRecords with metadataPrefix=marcxml took too much time:\n %s seconds. Limit: %s seconds""" % (duration, limit))
def test_touching_set(self):
    """oairepository - touch a set

    Steps performed, in order:

    1. list the datestamps of set 'cern:experiment';
    2. list the set again 'from' 5 seconds after its latest datestamp
       and expect no datestamp at all;
    3. call touch_oai_set('cern:experiment');
    4. repeat the request of step 2 and expect as many datestamps as
       in step 1, the new list comparing greater than the old one.
    """
    stamp_format = '%Y-%m-%dT%H:%M:%SZ'

    def set_query(from_date=None):
        # A fresh dict for every request
        argd = {
            'verb': 'ListIdentifiers',
            'metadataPrefix': 'marcxml',
            'set': 'cern:experiment',
        }
        if from_date is not None:
            argd['from'] = from_date
        return argd

    def harvest_datestamps(argd):
        out = StringIO()
        oai_repository_server.oai_list_records_or_identifiers(out, argd)
        return re.findall("<datestamp>(.*?)</datestamp>", out.getvalue())

    def as_datetimes(stamps):
        # Drop the last three struct_time fields, keep year..second
        return [datetime(*time.strptime(stamp, stamp_format)[:-3])
                for stamp in stamps]

    current_timestamps = as_datetimes(harvest_datestamps(set_query()))

    ## 5 seconds in the future to the last record
    future_timestamp = max(current_timestamps) + timedelta(0, 5)
    future_timestamp = future_timestamp.strftime(stamp_format)

    self.failIf(harvest_datestamps(set_query(future_timestamp)))

    from invenio.oai_repository_admin import touch_oai_set
    touch_oai_set('cern:experiment')

    new_timestamps = as_datetimes(harvest_datestamps(set_query(future_timestamp)))
    mismatch_message = "new %s, old %s, from: %s" % (
        new_timestamps, current_timestamps, future_timestamp)
    self.assertEqual(len(new_timestamps), len(current_timestamps), mismatch_message)
    self.failUnless(new_timestamps > current_timestamps)
def test_touching_set(self):
    """oairepository - touch a set

    Lists the datestamps of set "cern:experiment", checks that nothing
    is listed from 5 seconds past the latest one, calls touch_oai_set()
    on the set, and then checks that the same "from" request lists as
    many datestamps as the first request, comparing greater than them.
    """
    datestamp_pattern = "<datestamp>(.*?)</datestamp>"
    iso_format = "%Y-%m-%dT%H:%M:%SZ"

    def list_set(extra=None):
        # Build a new argument dict per request and return the raw response
        arguments = {
            "verb": "ListIdentifiers",
            "metadataPrefix": "marcxml",
            "set": "cern:experiment",
        }
        if extra:
            arguments.update(extra)
        buf = StringIO()
        oai_repository_server.oai_list_records_or_identifiers(buf, arguments)
        return buf.getvalue()

    def parse_all(texts):
        parsed = []
        for text in texts:
            # First six struct_time fields: year, month, day, hour, minute, second
            parsed.append(datetime(*time.strptime(text, iso_format)[:-3]))
        return parsed

    response = list_set()
    current_timestamps = parse_all(re.findall(datestamp_pattern, response))

    last_timestamp = max(current_timestamps)
    ## 5 seconds in the future to the last record
    future_timestamp = (last_timestamp + timedelta(0, 5)).strftime(iso_format)

    response = list_set({"from": future_timestamp})
    self.failIf(re.findall(datestamp_pattern, response))

    from invenio.oai_repository_admin import touch_oai_set
    touch_oai_set("cern:experiment")

    response = list_set({"from": future_timestamp})
    new_timestamps = parse_all(re.findall(datestamp_pattern, response))
    self.assertEqual(
        len(new_timestamps),
        len(current_timestamps),
        "new %s, old %s, from: %s" % (new_timestamps, current_timestamps, future_timestamp),
    )
    self.failUnless(new_timestamps > current_timestamps)
def test_from_and_until(self):
    """oairepository - testing selective harvesting with 'from' and 'until' parameters

    Harvests identifier/datestamp pairs with ListIdentifiers, then
    checks oai_get_recid_list() against the first pair:

    - the record is found when its own datestamp (full, or date-only)
      is given as 'from', as 'until', or as both;
    - the record is not found with 'from' one year later, nor with
      'until' one year earlier;
    - the range from the earliest to the latest harvested datestamp
      returns the same list as a call without any bound.
    """
    get_recids = oai_repository_server.oai_get_recid_list
    req = StringIO()

    # List available records, get datestamps and play with them
    oai_repository_server.oai_list_records_or_identifiers(
        req, {
            'verb': 'ListIdentifiers',
            'metadataPrefix': 'marcxml'
        })
    identifiers = req.getvalue()
    # Raw string: \s must reach the regex engine as written
    datestamps = re.findall(
        r'<identifier>(?P<id>.*?)</identifier>\s*<datestamp>(?P<date>.*?)</datestamp>',
        identifiers, re.M)

    # There must be some datestamps. Checked before indexing so that an
    # empty harvest is reported as a test failure, not as an IndexError.
    self.assertNotEqual([], datestamps)

    # Take one oai id together with its datestamp
    sample_oai_id, sample_datestamp = datestamps[0]
    # Find corresponding system number id
    sample_id = search_engine.perform_request_search(
        p=sample_oai_id, f=CFG_OAI_ID_FIELD)[0]

    # We must be able to retrieve an id with the date we have just found
    self.assert_(sample_id in get_recids(fromdate=sample_datestamp))
    self.assert_(sample_id in get_recids(untildate=sample_datestamp))
    self.assert_(sample_id in get_recids(untildate=sample_datestamp,
                                         fromdate=sample_datestamp))

    # Same, with short format date. Eg 2007-12-13
    short_datestamp = sample_datestamp.split('T')[0]
    self.assert_(sample_id in get_recids(fromdate=short_datestamp))
    self.assert_(sample_id in get_recids(untildate=short_datestamp))
    self.assert_(sample_id in get_recids(fromdate=short_datestamp,
                                         untildate=short_datestamp))

    # At later date (year after) we should not find our id again
    sample_datestamp_year = int(sample_datestamp[0:4])
    sample_datestamp_rest = sample_datestamp[4:]
    later_datestamp = str(sample_datestamp_year + 1) + sample_datestamp_rest
    self.assert_(sample_id not in get_recids(fromdate=later_datestamp))

    # At earlier date (year before) we should not find our id again
    earlier_datestamp = str(sample_datestamp_year - 1) + sample_datestamp_rest
    self.assert_(sample_id not in get_recids(untildate=earlier_datestamp))

    # From earliest date to latest date must include all oai records.
    # Map each parsed time to its original datestamp text, then pick
    # the extremes directly instead of sorting the key list.
    dates = dict((time.mktime(time.strptime(stamp, "%Y-%m-%dT%H:%M:%SZ")), stamp)
                 for (_oai_id, stamp) in datestamps)
    earliest_datestamp = dates[min(dates)]
    latest_datestamp = dates[max(dates)]
    self.assertEqual(get_recids(),
                     get_recids(fromdate=earliest_datestamp,
                                untildate=latest_datestamp))
def __call__(self, req, form):
    """OAI repository interface

    Washes the request arguments, has them checked by
    oai_repository_server.check_argd(), answers 503 with a Retry-After
    header when a throttled verb arrives less than CFG_OAI_SLEEP
    seconds after the RTdata file was last modified, and otherwise
    writes the XML response for the requested verb (or the OAI error)
    to req. When CFG_VALIDATE_RESPONSES is set, what was written is
    parsed and validated with OAI_PMH_VALIDATOR.

    Returns the retry message on the 503 path, "\\n" otherwise.
    Re-raises etree.DocumentInvalid after registering the exception.
    """
    # Clean input arguments. The protocol specifies that an error
    # has to be returned if the same argument is specified several
    # times. Eg:
    # oai2d?verb=ListIdentifiers&metadataPrefix=marcxml&metadataPrefix=marcxml
    # So keep the arguments as list for now so that check_argd can
    # return an error if needed (check_argd also transforms these
    # lists into strings)
    known_arguments = ('verb', 'metadataPrefix', 'from', 'until', 'set',
                       'identifier', 'resumptionToken')
    # Each argument gets its own (list, []) pair
    argd = wash_urlargd(form, dict((name, (list, [])) for name in known_arguments))

    if CFG_VALIDATE_RESPONSES:
        req.track_writings = True

    ## wash_urlargd(..) function cleaned everything, but also added
    ## unwanted parameters. Remove them now
    unwanted = [name for name in argd.keys()
                if name not in form and name != 'verb']
    for name in unwanted:
        del argd[name]

    ## wash_urlargd(..) function also removed unknown parameters
    ## that we would like to keep in order to send back an error
    ## as required by the protocol. But we do not need that value,
    ## so set it to empty string.
    for name in form.keys():
        if name not in argd:
            argd[name] = ''

    ## But still remove 'ln' parameter that was automatically added.
    if 'ln' in argd:
        del argd['ln']

    ## check request for OAI compliancy
    ## also transform all the list arguments into string
    oai_errors = oai_repository_server.check_argd(argd)

    ## check availability (OAI requests for Identify, ListSets and
    ## ListMetadataFormats are served immediately, otherwise we
    ## shall wait for CFG_OAI_SLEEP seconds between requests):
    rtdata_path = "%s/RTdata/RTdata" % CFG_CACHEDIR
    if os.path.exists(rtdata_path) \
            and argd['verb'] not in ["Identify", "ListMetadataFormats", "ListSets"] \
            and not argd.get('resumptionToken'):
        elapsed = int(time.time() - os.path.getmtime(rtdata_path))
        if elapsed < CFG_OAI_SLEEP:
            seconds_left = CFG_OAI_SLEEP - elapsed
            req.headers_out["Status-Code"] = "503"
            req.headers_out["Retry-After"] = "%d" % seconds_left
            req.status = apache.HTTP_SERVICE_UNAVAILABLE
            return "Retry after %d seconds" % seconds_left
    # Refresh the modification time that the check above reads
    os.system("touch %s" % rtdata_path)

    ## create OAI response
    req.content_type = "text/xml"
    req.send_http_header()

    if oai_errors:
        ## OAI error
        req.write(oai_repository_server.oai_error(argd, oai_errors))
    else:
        verb = argd['verb']
        if verb == "Identify":
            ## OAI Identify
            req.write(oai_repository_server.oai_identify(argd))
        elif verb == "ListSets":
            ## OAI ListSets
            req.write(oai_repository_server.oai_list_sets(argd))
        elif verb in ("ListIdentifiers", "ListRecords"):
            ## OAI ListIdentifiers or OAI ListRecords
            oai_repository_server.oai_list_records_or_identifiers(req, argd)
        elif verb == "GetRecord":
            ## OAI GetRecord
            req.write(oai_repository_server.oai_get_record(argd))
        elif verb == "ListMetadataFormats":
            ## OAI ListMetadataFormats
            req.write(oai_repository_server.oai_list_metadata_formats(argd))
        ## Unknown verb: nothing is written

    if CFG_VALIDATE_RESPONSES:
        req.track_writings = False
        try:
            written = cStringIO.StringIO(req.what_was_written)
            OAI_PMH_VALIDATOR.assertValid(etree.parse(written))
        except etree.DocumentInvalid:
            register_exception(req=req, alert_admin=True)
            raise

    return "\n"
def __call__(self, req, form):
    """OAI repository interface

    Processing order:

    1. wash the arguments of form, keeping them as lists;
    2. let oai_repository_server.check_argd() check them;
    3. for throttled verbs, reply 503 with a Retry-After header when
       the RTdata file was modified less than CFG_OAI_SLEEP seconds ago;
    4. write the response of the requested verb, or the OAI error, to req;
    5. when CFG_VALIDATE_RESPONSES is set, validate what was written
       with OAI_PMH_VALIDATOR.

    Returns the retry message in case 3, "\\n" otherwise. An
    etree.DocumentInvalid raised by validation is registered and
    re-raised.
    """
    # Clean input arguments. The protocol specifies that an error
    # has to be returned if the same argument is specified several
    # times. Eg:
    # oai2d?verb=ListIdentifiers&metadataPrefix=marcxml&metadataPrefix=marcxml
    # So keep the arguments as list for now so that check_argd can
    # return an error if needed (check_argd also transforms these
    # lists into strings)
    expected = {}
    for argument in ('verb', 'metadataPrefix', 'from', 'until', 'set',
                     'identifier', 'resumptionToken'):
        # A separate default list per argument
        expected[argument] = (list, [])
    argd = wash_urlargd(form, expected)

    if CFG_VALIDATE_RESPONSES:
        req.track_writings = True

    ## wash_urlargd(..) function cleaned everything, but also added
    ## unwanted parameters. Remove them now
    for param in [key for key in argd.keys()
                  if key != 'verb' and key not in form]:
        del argd[param]

    ## wash_urlargd(..) function also removed unknown parameters
    ## that we would like to keep in order to send back an error
    ## as required by the protocol. But we do not need that value,
    ## so set it to empty string.
    for param in form.keys():
        if param not in argd:
            argd[param] = ''

    ## But still remove 'ln' parameter that was automatically added.
    if 'ln' in argd:
        del argd['ln']

    ## check request for OAI compliancy
    ## also transform all the list arguments into string
    oai_errors = oai_repository_server.check_argd(argd)

    ## check availability (OAI requests for Identify, ListSets and
    ## ListMetadataFormats are served immediately, otherwise we
    ## shall wait for CFG_OAI_SLEEP seconds between requests):
    stamp_file = "%s/RTdata/RTdata" % CFG_CACHEDIR
    unthrottled_verbs = ["Identify", "ListMetadataFormats", "ListSets"]
    if os.path.exists(stamp_file) and (
            argd['verb'] not in unthrottled_verbs
            and not argd.get('resumptionToken')):
        time_gap = int(time.time() - os.path.getmtime(stamp_file))
        if time_gap < CFG_OAI_SLEEP:
            retry_after = CFG_OAI_SLEEP - time_gap
            req.headers_out["Status-Code"] = "503"
            req.headers_out["Retry-After"] = "%d" % retry_after
            req.status = apache.HTTP_SERVICE_UNAVAILABLE
            return "Retry after %d seconds" % retry_after
    # Refresh the modification time that the check above reads
    os.system("touch %s" % stamp_file)

    ## create OAI response
    req.content_type = "text/xml"
    req.send_http_header()

    if oai_errors:
        ## OAI error
        req.write(oai_repository_server.oai_error(argd, oai_errors))
    else:
        requested = argd['verb']
        if requested in ("ListIdentifiers", "ListRecords"):
            ## OAI ListIdentifiers or OAI ListRecords write to req themselves
            oai_repository_server.oai_list_records_or_identifiers(req, argd)
        elif requested == "Identify":
            ## OAI Identify
            req.write(oai_repository_server.oai_identify(argd))
        elif requested == "ListSets":
            ## OAI ListSets
            req.write(oai_repository_server.oai_list_sets(argd))
        elif requested == "GetRecord":
            ## OAI GetRecord
            req.write(oai_repository_server.oai_get_record(argd))
        elif requested == "ListMetadataFormats":
            ## OAI ListMetadataFormats
            req.write(oai_repository_server.oai_list_metadata_formats(argd))
        ## Unknown verb: nothing is written

    if CFG_VALIDATE_RESPONSES:
        req.track_writings = False
        try:
            document = etree.parse(cStringIO.StringIO(req.what_was_written))
            OAI_PMH_VALIDATOR.assertValid(document)
        except etree.DocumentInvalid:
            register_exception(req=req, alert_admin=True)
            raise

    return "\n"