def test_resumption_token(self):
        """oairepository - testing harvesting with bad resumption token"""
        # 'foobar' is passed as resumptionToken; the response must report
        # it as a badResumptionToken error.
        req = StringIO()
        oai_repository_server.oai_list_records_or_identifiers(req, {'resumptionToken': 'foobar', 'verb': 'ListRecords'})
        response = req.getvalue()

        # assertTrue replaces assert_, a deprecated unittest alias.
        self.assertTrue('badResumptionToken' in response,
                        "badResumptionToken not found in response: %s" % response)
    def test_resumption_token(self):
        """oairepository - testing harvesting with bad resumption token"""
        # "foobar" is passed as resumptionToken; the response must report
        # it as a badResumptionToken error.
        req = StringIO()
        oai_repository_server.oai_list_records_or_identifiers(req, {"resumptionToken": "foobar", "verb": "ListRecords"})
        response = req.getvalue()

        # assertTrue replaces assert_, a deprecated unittest alias.
        self.assertTrue("badResumptionToken" in response,
                        "badResumptionToken not found in response: %s" % response)
 def test_hidden_fields(self):
     """oairepository - not exposing hidden fields"""
     req = StringIO()
     # Request record oai:atlantis.cern.ch:12 in marcxml and inspect the
     # serialized output
     oai_repository_server.oai_list_records_or_identifiers(req, {'verb': 'GetRecord', 'metadataPrefix': 'marcxml', 'identifier': 'oai:atlantis.cern.ch:12'})
     result = req.getvalue()
     # assertFalse replaces failIf, a deprecated unittest alias
     self.assertFalse("<error" in result, "Errors found in result: %s" % result)
     self.assertFalse('<marc:datafield tag="595" ind1=" " ind2=" " >' in result, "Hidden field 595 found in result: %s" % result)
Ejemplo n.º 4
0
    def test_resumption_token(self):
        """oairepository - testing harvesting with bad resumption token"""
        # 'foobar' is passed as resumptionToken; the response must report
        # it as a badResumptionToken error.
        req = StringIO()
        oai_repository_server.oai_list_records_or_identifiers(
            req, {
                'resumptionToken': 'foobar',
                'verb': 'ListRecords'
            })
        response = req.getvalue()

        # assertTrue replaces assert_, a deprecated unittest alias.
        self.assertTrue(
            'badResumptionToken' in response,
            "badResumptionToken not found in response: %s" % response)
    def test_response_speed_marcxml(self):
        """oairepository - speed of response for marcxml output"""
        # Time budget, in seconds, granted per record
        seconds_per_record = 0.05
        failure_template = ("Response for ListRecords with metadataPrefix=marcxml took too much time:\n\n"
                            "%s seconds.\n"
                            "Limit: %s seconds")

        # Time one complete marcxml ListRecords call
        started_at = time.time()
        oai_repository_server.oai_list_records_or_identifiers(StringIO(), argd={'metadataPrefix': 'marcxml', 'verb': 'ListRecords'})
        elapsed = time.time() - started_at

        limit = self.number_of_records * seconds_per_record
        if elapsed > limit:
            self.fail(failure_template % (elapsed, limit))
    def test_response_speed_oai(self):
        """oairepository - speed of response for oai_dc output"""
        # Time budget, in seconds, granted per record
        seconds_per_record = 0.03
        failure_template = ("Response for ListRecords with metadataPrefix=oai_dc took too much time:\n"
                            "%s seconds.\n"
                            "Limit: %s seconds")

        # Time one complete oai_dc ListRecords call
        started_at = time.time()
        oai_repository_server.oai_list_records_or_identifiers(StringIO(), {'metadataPrefix': 'oai_dc', 'verb': 'ListRecords'})
        elapsed = time.time() - started_at

        limit = self.number_of_records * seconds_per_record
        if elapsed > limit:
            self.fail(failure_template % (elapsed, limit))
 def test_verbs(self):
     """oairepository - testing verbs"""
     server = oai_repository_server

     # Identify returns its response as a string
     identify_xml = server.oai_identify({'verb': 'Identify'})
     self.assertNotEqual(None, re.search("Identify", identify_xml))

     # ListIdentifiers and ListRecords write their response to the
     # request-like object they are given
     for listing_verb in ("ListIdentifiers", "ListRecords"):
         ret = StringIO()
         server.oai_list_records_or_identifiers(ret, {'verb': listing_verb, 'metadataPrefix': 'marcxml'})
         self.assertNotEqual(None, re.search(listing_verb, ret.getvalue()))

     # The remaining verbs return their response as a string too
     formats_xml = server.oai_list_metadata_formats({'verb': 'ListMetadataFormats'})
     self.assertNotEqual(None, re.search("ListMetadataFormats", formats_xml))

     sets_xml = server.oai_list_sets({'verb': 'ListSets'})
     self.assertNotEqual(None, re.search("ListSets", sets_xml))

     record_xml = server.oai_get_record({'identifier': 'oai:atlantis.cern.ch:1', 'verb': 'GetRecord'})
     self.assertNotEqual(None, re.search("GetRecord", record_xml))
Ejemplo n.º 8
0
 def test_hidden_fields(self):
     """oairepository - not exposing hidden fields"""
     req = StringIO()
     # Request record oai:atlantis.cern.ch:12 in marcxml and inspect the
     # serialized output
     oai_repository_server.oai_list_records_or_identifiers(
         req, {
             'verb': 'GetRecord',
             'metadataPrefix': 'marcxml',
             'identifier': 'oai:atlantis.cern.ch:12'
         })
     result = req.getvalue()
     # assertFalse replaces failIf, a deprecated unittest alias
     self.assertFalse("<error" in result,
                      "Errors found in result: %s" % result)
     self.assertFalse(
         '<marc:datafield tag="595" ind1=" " ind2=" " >' in result,
         "Hidden field 595 found in result: %s" % result)
    def test_from_and_until(self):
        """oairepository - testing selective harvesting with 'from' and 'until' parameters"""

        req = StringIO()
        # List the identifiers with their datestamps; the datestamps are
        # then used as 'from'/'until' boundaries
        oai_repository_server.oai_list_records_or_identifiers(req, {'verb': 'ListIdentifiers', 'metadataPrefix': 'marcxml'})
        identifiers = req.getvalue()
        # Raw string so that '\s' reaches the regex engine untouched
        datestamps = re.findall(r'<identifier>(?P<id>.*?)</identifier>\s*<datestamp>(?P<date>.*?)</datestamp>', identifiers, re.M)

        # There must be some datestamps. Checked before indexing so that an
        # empty listing is reported as a test failure, not as an IndexError
        self.assertNotEqual([], datestamps)

        # Take one (oai id, datestamp) pair
        sample_oai_id, sample_datestamp = datestamps[0]
        sample_id = search_engine.perform_request_search(p=sample_oai_id,
                                                         f=CFG_OAI_ID_FIELD)[0] # Find corresponding system number id

        get_recids = oai_repository_server.oai_get_recid_list

        # We must be able to retrieve an id with the date we have just found
        recids = get_recids(fromdate=sample_datestamp)
        self.assertTrue(sample_id in recids, "%s not in %s (fromdate=%s)" % (sample_id, recids, sample_datestamp))
        recids = get_recids(untildate=sample_datestamp)
        self.assertTrue(sample_id in recids, "%s not in %s" % (sample_id, recids))
        self.assertTrue(sample_id in get_recids(untildate=sample_datestamp,
                                                fromdate=sample_datestamp))

        # Same, with short format date. Eg 2007-12-13
        sample_day = sample_datestamp.split('T')[0]
        self.assertTrue(sample_id in get_recids(fromdate=sample_day))
        self.assertTrue(sample_id in get_recids(untildate=sample_day))
        self.assertTrue(sample_id in get_recids(fromdate=sample_day,
                                                untildate=sample_day))

        # At later date (year after) we should not find our id again
        sample_datestamp_year = int(sample_datestamp[0:4])
        sample_datestamp_rest = sample_datestamp[4:]
        later_datestamp = str(sample_datestamp_year + 1) + sample_datestamp_rest
        self.assertTrue(sample_id not in get_recids(fromdate=later_datestamp))

        # At earlier date (year before) we should not find our id again
        earlier_datestamp = str(sample_datestamp_year - 1) + sample_datestamp_rest
        self.assertTrue(sample_id not in get_recids(untildate=earlier_datestamp))

        # From earliest date to latest date must include all oai records.
        # Map each parsed time to its datestamp string, then pick the
        # extremes with min()/max() (no in-place sort of dict keys needed)
        dates = dict([(time.mktime(time.strptime(date[1], "%Y-%m-%dT%H:%M:%SZ")), date[1]) for date in datestamps])
        earliest_datestamp = dates[min(dates)]
        latest_datestamp = dates[max(dates)]
        self.assertEqual(get_recids(),
                         get_recids(fromdate=earliest_datestamp,
                                    untildate=latest_datestamp))
Ejemplo n.º 10
0
    def test_response_speed_oai(self):
        """oairepository - speed of response for oai_dc output"""
        # Time budget, in seconds, granted per record
        seconds_per_record = 0.02
        failure_template = (
            "Response for ListRecords with metadataPrefix=oai_dc took too much time:\n"
            "%s seconds.\n"
            "Limit: %s seconds")

        # Time one complete oai_dc ListRecords call
        request_args = {'metadataPrefix': 'oai_dc', 'verb': 'ListRecords'}
        started_at = time.time()
        oai_repository_server.oai_list_records_or_identifiers(
            StringIO(), request_args)
        elapsed = time.time() - started_at

        limit = self.number_of_records * seconds_per_record
        if elapsed > limit:
            self.fail(failure_template % (elapsed, limit))
Ejemplo n.º 11
0
    def test_response_speed_marcxml(self):
        """oairepository - speed of response for marcxml output"""
        # Time budget, in seconds, granted per record
        seconds_per_record = 0.05
        failure_template = (
            "Response for ListRecords with metadataPrefix=marcxml took too much time:\n\n"
            "%s seconds.\n"
            "Limit: %s seconds")

        # Time one complete marcxml ListRecords call
        request_args = {'metadataPrefix': 'marcxml', 'verb': 'ListRecords'}
        started_at = time.time()
        oai_repository_server.oai_list_records_or_identifiers(
            StringIO(), argd=request_args)
        elapsed = time.time() - started_at

        limit = self.number_of_records * seconds_per_record
        if elapsed > limit:
            self.fail(failure_template % (elapsed, limit))
Ejemplo n.º 12
0
 def test_touching_set(self):
     """oairepository - touch a set"""
     timestamp_format = '%Y-%m-%dT%H:%M:%SZ'

     def harvest_datestamps(from_date=None):
         """Return the raw <datestamp> values listed for cern:experiment."""
         argd = {
             'verb': 'ListIdentifiers',
             'metadataPrefix': 'marcxml',
             'set': 'cern:experiment'
         }
         if from_date is not None:
             argd['from'] = from_date
         req = StringIO()
         oai_repository_server.oai_list_records_or_identifiers(req, argd)
         return re.findall("<datestamp>(.*?)</datestamp>", req.getvalue())

     def parse(datestamps):
         """Convert datestamp strings into datetime objects."""
         # Dropping the last three struct_time fields leaves
         # year, month, day, hour, minute, second
         return [
             datetime(*time.strptime(stamp, timestamp_format)[:-3])
             for stamp in datestamps
         ]

     current_timestamps = parse(harvest_datestamps())
     # max() below needs at least one datestamp; fail with a clear message
     # rather than with a ValueError
     self.assertTrue(current_timestamps,
                     "No datestamp found for set cern:experiment")
     last_timestamp = max(current_timestamps)
     ## 5 seconds in the future to the last record
     future_timestamp = last_timestamp + timedelta(0, 5)
     future_timestamp = future_timestamp.strftime(timestamp_format)

     # Before touching the set, nothing may be listed from that date on
     self.assertFalse(harvest_datestamps(future_timestamp))

     from invenio.oai_repository_admin import touch_oai_set
     touch_oai_set('cern:experiment')

     # After touching the set, as many records as before must be listed
     # from that date on
     new_timestamps = parse(harvest_datestamps(future_timestamp))
     self.assertEqual(
         len(new_timestamps), len(current_timestamps),
         "new %s, old %s, from: %s" %
         (new_timestamps, current_timestamps, future_timestamp))
     self.assertTrue(new_timestamps > current_timestamps)
    def test_touching_set(self):
        """oairepository - touch a set"""
        timestamp_format = "%Y-%m-%dT%H:%M:%SZ"

        def harvest_datestamps(from_date=None):
            """Return the raw <datestamp> values listed for cern:experiment."""
            argd = {
                "verb": "ListIdentifiers",
                "metadataPrefix": "marcxml",
                "set": "cern:experiment",
            }
            if from_date is not None:
                argd["from"] = from_date
            req = StringIO()
            oai_repository_server.oai_list_records_or_identifiers(req, argd)
            return re.findall("<datestamp>(.*?)</datestamp>", req.getvalue())

        def parse(datestamps):
            """Convert datestamp strings into datetime objects."""
            # Dropping the last three struct_time fields leaves
            # year, month, day, hour, minute, second
            return [
                datetime(*time.strptime(stamp, timestamp_format)[:-3]) for stamp in datestamps
            ]

        current_timestamps = parse(harvest_datestamps())
        # max() below needs at least one datestamp; fail with a clear message
        # rather than with a ValueError
        self.assertTrue(current_timestamps, "No datestamp found for set cern:experiment")
        last_timestamp = max(current_timestamps)
        future_timestamp = last_timestamp + timedelta(0, 5)  ## 5 seconds in the future to the last record
        future_timestamp = future_timestamp.strftime(timestamp_format)

        # Before touching the set, nothing may be listed from that date on
        self.assertFalse(harvest_datestamps(future_timestamp))

        from invenio.oai_repository_admin import touch_oai_set

        touch_oai_set("cern:experiment")

        # After touching the set, as many records as before must be listed
        # from that date on
        new_timestamps = parse(harvest_datestamps(future_timestamp))
        self.assertEqual(
            len(new_timestamps),
            len(current_timestamps),
            "new %s, old %s, from: %s" % (new_timestamps, current_timestamps, future_timestamp),
        )
        self.assertTrue(new_timestamps > current_timestamps)
Ejemplo n.º 14
0
    def test_from_and_until(self):
        """oairepository - testing selective harvesting with 'from' and 'until' parameters"""

        req = StringIO()
        # List the identifiers with their datestamps; the datestamps are
        # then used as 'from'/'until' boundaries
        oai_repository_server.oai_list_records_or_identifiers(
            req, {
                'verb': 'ListIdentifiers',
                'metadataPrefix': 'marcxml'
            })
        identifiers = req.getvalue()
        # Raw string so that '\s' reaches the regex engine untouched
        datestamps = re.findall(
            r'<identifier>(?P<id>.*?)</identifier>\s*<datestamp>(?P<date>.*?)</datestamp>',
            identifiers, re.M)

        # There must be some datestamps. Checked before indexing so that an
        # empty listing is reported as a test failure, not as an IndexError
        self.assertNotEqual([], datestamps)

        # Take one (oai id, datestamp) pair
        sample_oai_id, sample_datestamp = datestamps[0]
        sample_id = search_engine.perform_request_search(
            p=sample_oai_id,
            f=CFG_OAI_ID_FIELD)[0]  # Find corresponding system number id

        get_recids = oai_repository_server.oai_get_recid_list

        # We must be able to retrieve an id with the date we have just found
        recids = get_recids(fromdate=sample_datestamp)
        self.assertTrue(
            sample_id in recids,
            "%s not in %s (fromdate=%s)" % (sample_id, recids,
                                            sample_datestamp))
        recids = get_recids(untildate=sample_datestamp)
        self.assertTrue(sample_id in recids,
                        "%s not in %s" % (sample_id, recids))
        self.assertTrue(sample_id in get_recids(untildate=sample_datestamp,
                                                fromdate=sample_datestamp))

        # Same, with short format date. Eg 2007-12-13
        sample_day = sample_datestamp.split('T')[0]
        self.assertTrue(sample_id in get_recids(fromdate=sample_day))
        self.assertTrue(sample_id in get_recids(untildate=sample_day))
        self.assertTrue(sample_id in get_recids(fromdate=sample_day,
                                                untildate=sample_day))

        # At later date (year after) we should not find our id again
        sample_datestamp_year = int(sample_datestamp[0:4])
        sample_datestamp_rest = sample_datestamp[4:]
        later_datestamp = str(sample_datestamp_year +
                              1) + sample_datestamp_rest
        self.assertTrue(
            sample_id not in get_recids(fromdate=later_datestamp))

        # At earlier date (year before) we should not find our id again
        earlier_datestamp = str(sample_datestamp_year -
                                1) + sample_datestamp_rest
        self.assertTrue(
            sample_id not in get_recids(untildate=earlier_datestamp))

        # From earliest date to latest date must include all oai records.
        # Map each parsed time to its datestamp string, then pick the
        # extremes with min()/max() (no in-place sort of dict keys needed)
        dates = dict([(time.mktime(time.strptime(date[1],
                                                 "%Y-%m-%dT%H:%M:%SZ")), date[1])
                      for date in datestamps])
        earliest_datestamp = dates[min(dates)]
        latest_datestamp = dates[max(dates)]
        self.assertEqual(get_recids(),
                         get_recids(fromdate=earliest_datestamp,
                                    untildate=latest_datestamp))
    def __call__(self, req, form):
        """OAI repository interface

        Washes the submitted arguments, checks them with
        oai_repository_server.check_argd, throttles requests using the
        modification time of the RTdata/RTdata file under CFG_CACHEDIR,
        then writes the response for the requested verb (or the OAI
        error) to req.

        req  -- request object; status, headers, content type and the
                response body are set/written on it
        form -- the submitted request arguments

        Returns "Retry after N seconds" when the request is throttled
        (status 503 is set on req), otherwise a single newline once the
        response has been written. When CFG_VALIDATE_RESPONSES is set and
        the written document is invalid, the exception is registered and
        etree.DocumentInvalid is re-raised.
        """

        # Clean input arguments. The protocol specifies that an error
        # has to be returned if the same argument is specified several
        # times. Eg:
        # oai2d?verb=ListIdentifiers&metadataPrefix=marcxml&metadataPrefix=marcxml
        # So keep the arguments as list for now so that check_argd can
        # return an error if needed (check_argd also transforms these
        # lists into strings)
        argd = wash_urlargd(form, {'verb': (list, []),
                                   'metadataPrefix': (list, []),
                                   'from': (list, []),
                                   'until': (list, []),
                                   'set': (list, []),
                                   'identifier': (list, []),
                                   'resumptionToken': (list, []),
                                   })

        if CFG_VALIDATE_RESPONSES:
            req.track_writings = True

        ## wash_urlargd(..) function cleaned everything, but also added
        ## unwanted parameters. Remove them now. Iterate over a copy of
        ## the keys since entries are deleted inside the loop.
        for param in list(argd.keys()):
            if param not in form and param != 'verb':
                del argd[param]

        ## wash_urlargd(..) function also removed unknown parameters
        ## that we would like to keep in order to send back an error
        ## as required by the protocol. But we do not need that value,
        ## so set it to empty string.
        for param in form.keys():
            if param not in argd:
                argd[param] = ''

        ## But still remove 'ln' parameter that was automatically added.
        if 'ln' in argd:
            del argd['ln']

        ## check request for OAI compliancy
        ## also transform all the list arguments into string
        oai_errors = oai_repository_server.check_argd(argd)

        ## check availability (OAI requests for Identify, ListSets and
        ## ListMetadataFormats are served immediately, otherwise we
        ## shall wait for CFG_OAI_SLEEP seconds between requests):
        rt_file = "%s/RTdata/RTdata" % CFG_CACHEDIR
        if os.path.exists(rt_file) and (argd['verb'] not in ["Identify", "ListMetadataFormats", "ListSets"] and not argd.get('resumptionToken')):
            time_gap = int(time.time() - os.path.getmtime(rt_file))
            if time_gap < CFG_OAI_SLEEP:
                req.headers_out["Status-Code"] = "503"
                req.headers_out["Retry-After"] = "%d" % (CFG_OAI_SLEEP - time_gap)
                req.status = apache.HTTP_SERVICE_UNAVAILABLE
                return "Retry after %d seconds" % (CFG_OAI_SLEEP - time_gap)

        ## Record the time of this request for the check above. Done
        ## in-process instead of spawning a shell "touch"; kept best
        ## effort (as the ignored shell exit status was) so that a
        ## failure here never breaks the response.
        try:
            open(rt_file, 'a').close()  # create the file when missing
            os.utime(rt_file, None)     # set its mtime to now
        except (IOError, OSError):
            pass

        ## create OAI response
        req.content_type = "text/xml"
        req.send_http_header()

        if not oai_errors:
            ## OAI Identify
            if argd['verb'] == "Identify":
                req.write(oai_repository_server.oai_identify(argd))

            ## OAI ListSets
            elif argd['verb'] == "ListSets":
                req.write(oai_repository_server.oai_list_sets(argd))

            ## OAI ListIdentifiers or OAI ListRecords
            elif argd['verb'] in ("ListIdentifiers", "ListRecords"):
                oai_repository_server.oai_list_records_or_identifiers(req, argd)

            ## OAI GetRecord
            elif argd['verb'] == "GetRecord":
                req.write(oai_repository_server.oai_get_record(argd))

            ## OAI ListMetadataFormats
            elif argd['verb'] == "ListMetadataFormats":
                req.write(oai_repository_server.oai_list_metadata_formats(argd))

            ## Unknown verb: nothing is written here

        ## OAI error
        else:
            req.write(oai_repository_server.oai_error(argd, oai_errors))

        if CFG_VALIDATE_RESPONSES:
            req.track_writings = False
            try:
                OAI_PMH_VALIDATOR.assertValid(etree.parse(cStringIO.StringIO(req.what_was_written)))
            except etree.DocumentInvalid:
                register_exception(req=req, alert_admin=True)
                raise
        return "\n"
    def __call__(self, req, form):
        """OAI repository interface

        Washes the submitted arguments, checks them with
        oai_repository_server.check_argd, throttles requests using the
        modification time of the RTdata/RTdata file under CFG_CACHEDIR,
        then writes the response for the requested verb (or the OAI
        error) to req.

        req  -- request object; status, headers, content type and the
                response body are set/written on it
        form -- the submitted request arguments

        Returns "Retry after N seconds" when the request is throttled
        (status 503 is set on req), otherwise a single newline once the
        response has been written. When CFG_VALIDATE_RESPONSES is set and
        the written document is invalid, the exception is registered and
        etree.DocumentInvalid is re-raised.
        """

        # Clean input arguments. The protocol specifies that an error
        # has to be returned if the same argument is specified several
        # times. Eg:
        # oai2d?verb=ListIdentifiers&metadataPrefix=marcxml&metadataPrefix=marcxml
        # So keep the arguments as list for now so that check_argd can
        # return an error if needed (check_argd also transforms these
        # lists into strings)
        argd = wash_urlargd(form, {'verb': (list, []),
                                   'metadataPrefix': (list, []),
                                   'from': (list, []),
                                   'until': (list, []),
                                   'set': (list, []),
                                   'identifier': (list, []),
                                   'resumptionToken': (list, []),
                                   })

        if CFG_VALIDATE_RESPONSES:
            req.track_writings = True

        ## wash_urlargd(..) function cleaned everything, but also added
        ## unwanted parameters. Remove them now. Iterate over a copy of
        ## the keys since entries are deleted inside the loop.
        for param in list(argd.keys()):
            if param not in form and param != 'verb':
                del argd[param]

        ## wash_urlargd(..) function also removed unknown parameters
        ## that we would like to keep in order to send back an error
        ## as required by the protocol. But we do not need that value,
        ## so set it to empty string.
        for param in form.keys():
            if param not in argd:
                argd[param] = ''

        ## But still remove 'ln' parameter that was automatically added.
        if 'ln' in argd:
            del argd['ln']

        ## check request for OAI compliancy
        ## also transform all the list arguments into string
        oai_errors = oai_repository_server.check_argd(argd)

        ## check availability (OAI requests for Identify, ListSets and
        ## ListMetadataFormats are served immediately, otherwise we
        ## shall wait for CFG_OAI_SLEEP seconds between requests):
        rt_file = "%s/RTdata/RTdata" % CFG_CACHEDIR
        if os.path.exists(rt_file) and (argd['verb'] not in ["Identify", "ListMetadataFormats", "ListSets"] and not argd.get('resumptionToken')):
            time_gap = int(time.time() - os.path.getmtime(rt_file))
            if time_gap < CFG_OAI_SLEEP:
                req.headers_out["Status-Code"] = "503"
                req.headers_out["Retry-After"] = "%d" % (CFG_OAI_SLEEP - time_gap)
                req.status = apache.HTTP_SERVICE_UNAVAILABLE
                return "Retry after %d seconds" % (CFG_OAI_SLEEP - time_gap)

        ## Record the time of this request for the check above. Done
        ## in-process instead of spawning a shell "touch"; kept best
        ## effort (as the ignored shell exit status was) so that a
        ## failure here never breaks the response.
        try:
            open(rt_file, 'a').close()  # create the file when missing
            os.utime(rt_file, None)     # set its mtime to now
        except (IOError, OSError):
            pass

        ## create OAI response
        req.content_type = "text/xml"
        req.send_http_header()

        if not oai_errors:
            ## OAI Identify
            if argd['verb'] == "Identify":
                req.write(oai_repository_server.oai_identify(argd))

            ## OAI ListSets
            elif argd['verb'] == "ListSets":
                req.write(oai_repository_server.oai_list_sets(argd))

            ## OAI ListIdentifiers or OAI ListRecords
            elif argd['verb'] in ("ListIdentifiers", "ListRecords"):
                oai_repository_server.oai_list_records_or_identifiers(req, argd)

            ## OAI GetRecord
            elif argd['verb'] == "GetRecord":
                req.write(oai_repository_server.oai_get_record(argd))

            ## OAI ListMetadataFormats
            elif argd['verb'] == "ListMetadataFormats":
                req.write(oai_repository_server.oai_list_metadata_formats(argd))

            ## Unknown verb: nothing is written here

        ## OAI error
        else:
            req.write(oai_repository_server.oai_error(argd, oai_errors))

        if CFG_VALIDATE_RESPONSES:
            req.track_writings = False
            try:
                OAI_PMH_VALIDATOR.assertValid(etree.parse(cStringIO.StringIO(req.what_was_written)))
            except etree.DocumentInvalid:
                register_exception(req=req, alert_admin=True)
                raise
        return "\n"