Example #1
0
 def _oai2d_endpoint_identifiers(self):
     """Fetch the community's OAI set from the endpoint as a set of recids.

     ``sickle.app.requests.get`` is patched with the object returned by
     ``sickle_requests_get_mock()`` while the ``ListIdentifiers`` request
     is made and consumed. Each recid is the integer found after the last
     ``:`` of a header's OAI identifier.
     """
     mocked_get = sickle_requests_get_mock()
     with patch('sickle.app.requests.get', new=mocked_get):
         client = Sickle('http://auditor/oai2d')
         headers = client.ListIdentifiers(set=self.community.oaiset_spec,
                                          metadataPrefix='oai_dc')
         recids = set()
         for header in headers:
             # Keep only the part after the last ':' of the identifier.
             recids.add(int(header.identifier.rpartition(':')[2]))
         return recids
Example #2
0
File: utils.py Project: llcit/llt
    def test_list_oai_collections(self, community):
        """Build the name-sorted list of collections owned by a community.

        Collections are a secondary grouping concept in OAI. Two OAI-PMH
        verbs are used: ``ListIdentifiers`` to find which collection
        setSpecs occur on the community's record headers, and ``ListSets``
        to look up a name for each of those setSpecs.

        Args:
            community: object providing ``repository.base_url`` (used as
                the OAI endpoint) and ``identifier`` (used as the ``set``
                argument of the ``ListIdentifiers`` request).

        Returns:
            A list of ``(setSpec, setName)`` tuples sorted by name. The
            name is ``None`` for a collection that ``ListSets`` did not
            report. The same list is stored on ``self.collections``.
        """
        sickle = Sickle(community.repository.base_url)

        # Retrieve the record headers associated with the community.
        record_headers = sickle.ListIdentifiers(metadataPrefix='oai_dc',
                                                set=community.identifier)

        # Register every collection setSpec found on those headers; the
        # names are filled in below.
        community_collections = {
            spec: None
            for header in record_headers
            for spec in header.setSpecs
            if spec.startswith('col')
        }

        # Map names to ids {setSpec: setName}. The first three characters
        # of each listed setSpec are replaced by 'col' before the lookup:
        # per the original author's note, ListSets appears to report a
        # collection with a 'com' prefix (possibly a bug in the OAI server).
        for oai_set in sickle.ListSets():
            spec = 'col' + oai_set.setSpec[3:]
            if spec in community_collections:
                community_collections[spec] = oai_set.setName

        # Sort by name. A collection missing from ListSets keeps None as
        # its name; None cannot be ordered against str on Python 3, so it
        # is sorted as the empty string (i.e. first).
        self.collections = sorted(community_collections.items(),
                                  key=lambda item: item[1] or '')
        return self.collections
Example #3
0
def listIdentifiers(request):
    """
    POST http://localhost/oai_pmh/api/listidentifiers
    POST data query='{"url":"value", "metadataprefix":"value"}' optional {"set":"value"}

    Validates the posted data with ``RegistryURLSerializer``, issues an
    OAI-PMH ``ListIdentifiers`` request against ``url`` and returns every
    header, converted to a dict and passed through
    ``ListIdentifierSerializer``.

    Returns:
        A ``Response`` with status 200 and the serialized headers on
        success; ``e.response()`` of any raised ``OAIAPIException``
        (presumably including the 400 error raised for invalid input --
        confirm the exception hierarchy); otherwise a status 500
        ``Response`` carrying a labelled error message.
    """
    try:
        serializer = RegistryURLSerializer(data=request.DATA)
        if not serializer.is_valid():
            raise OAIAPISerializeLabelledException(errors=serializer.errors, status=status.HTTP_400_BAD_REQUEST)

        url = request.DATA['url']
        metadataprefix = request.DATA['metadataprefix']
        setH = request.DATA.get('set', None)
        sickle = Sickle(url)
        rsp = sickle.ListIdentifiers(metadataPrefix=metadataprefix, set=setH)
        # Plain iteration drains the response on both Python 2 and 3,
        # without relying on a public ``next()`` method.
        rtn = [dict(header) for header in rsp]

        serializer = ListIdentifierSerializer(rtn)
        return Response(serializer.data, status=status.HTTP_200_OK)
    except OAIAPIException as e:
        return e.response()
    except Exception as e:
        # ``e.message`` only exists on Python 2 exceptions (or custom
        # ones); fall back to the exception itself so that building the
        # error response cannot raise AttributeError.
        detail = getattr(e, 'message', e)
        content = APIMessage.getMessageLabelled('An error occurred when attempting to identify resource: %s'%detail)
        return Response(content, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
    def generate(self):
        """Build the list of ResourceSync resources for the configured set.

        Issues a ``ListIdentifiers`` request (deleted records ignored) using
        the ``oaipmh_base_url``, ``oaipmh_set`` and ``oaipmh_metadataprefix``
        entries of ``self.params``, then converts every returned header with
        ``self.oaipmh_header_to_resourcesync_resource``. Each resource
        represents one full OAI-PMH record (i.e., the result of a GetRecord
        request).
        """
        base_url = self.params['oaipmh_base_url']
        provider = Sickle(base_url)
        headers = provider.ListIdentifiers(
            ignore_deleted=True,
            set=self.params['oaipmh_set'],
            metadataPrefix=self.params['oaipmh_metadataprefix'])

        to_resource = self.oaipmh_header_to_resourcesync_resource
        return [to_resource(header) for header in headers]
Example #5
0
def harvest(host, from_date, until, format, out, set, verbose):
    """Harvest OAI-PMH records from ``host`` into the file ``out``.

    The identifiers matching the query are listed first; each record is then
    fetched with ``GetRecord`` and its raw XML is written, UTF-8 encoded,
    between a ``<records>`` and a ``</records>`` marker.

    Args:
        host: endpoint handed to ``Sickle``.
        from_date: value sent as the ``from`` request parameter.
        until: value sent as the ``until`` request parameter.
        format: metadata prefix used for listing and for ``GetRecord``.
        out: path of the output file, opened in binary write mode.
        set: set to restrict the listing to; omitted from the request
            when ``None``.
        verbose: log at DEBUG level when true, INFO otherwise.

    Calls ``sys.exit()`` when the listing raises ``NoRecordsMatch``.
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    logging.info("OAI-PMH harvesting from %s", host)
    logging.info("From date = %s", from_date)
    logging.info("Until date = %s", until)
    logging.info("Metadata format = %s", format)
    logging.info("Outfile = %s", out)

    mysickle = Sickle(host, iterator=OAIItemIterator)

    # 'from' is a Python keyword, so the arguments travel in a dict.
    params = {'metadataPrefix': format, 'from': from_date, 'until': until}
    if set is not None:
        params['set'] = set

    try:
        responses = mysickle.ListIdentifiers(**params)
    except NoRecordsMatch:
        logging.info("No records harvested: the combination of the values of "
                     "the arguments results in an empty list.")
        sys.exit()

    # Collect every identifier up front so the total is known before the
    # per-record requests start.
    identifier_list = [header.identifier for header in responses]

    logging.info(f"Identifier count to harvest: {len(identifier_list)}")

    with open(out, 'wb') as f:
        f.write(b'<records>')

        for index, identifier in enumerate(identifier_list):
            r = mysickle.GetRecord(identifier=identifier,
                                   metadataPrefix=format)
            f.write(r.raw.encode('utf8'))
            logging.debug(index)
            logging.debug(r.raw)

        f.write(b'</records>')

    logging.info("Total records harvested: %i", len(identifier_list))
Example #6
0
class TestCase(unittest.TestCase):
    """Exercise the Sickle client with ``Sickle.harvest`` replaced.

    ``sickle.app.Sickle.harvest`` is patched with ``mock_harvest`` for the
    duration of every test, so the expected counts below describe the data
    that ``mock_harvest`` serves.
    """

    def __init__(self, methodName='runTest'):
        super(TestCase, self).__init__(methodName)
        # Created once per test instance; started in setUp and stopped in
        # tearDown.
        self.patch = mock.patch('sickle.app.Sickle.harvest', mock_harvest)

    def setUp(self):
        self.patch.start()
        self.sickle = Sickle('http://localhost')

    def tearDown(self):
        self.patch.stop()

    def test_OAIResponse(self):
        response = self.sickle.harvest(verb='ListRecords',
                                       metadataPrefix='oai_dc')
        self.assertIsInstance(response.xml, etree._Element)
        self.assertIsInstance(response.raw, string_types)

    def test_broken_XML(self):
        # Unparseable XML yields no tree, but the raw text stays available.
        response = self.sickle.harvest(verb='ListRecords',
                                       resumptionToken='ListRecordsBroken.xml')
        self.assertIsNone(response.xml)
        self.assertIsInstance(response.raw, string_types)

    def test_ListRecords(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc')
        # unittest assertions (not bare ``assert``) so the check still runs
        # under ``python -O``.
        self.assertEqual(len(list(records)), 8)

    def test_ListRecords_ignore_deleted(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc',
                                          ignore_deleted=True)
        self.assertEqual(len(list(records)), 4)

    def test_ListSets(self):
        sets = list(self.sickle.ListSets())
        self.assertEqual(131, len(sets))
        # Smoke check: an item must be convertible to a dict.
        dict(sets[0])

    def test_ListMetadataFormats(self):
        mdfs = list(self.sickle.ListMetadataFormats())
        self.assertEqual(5, len(mdfs))
        # Smoke check: an item must be convertible to a dict.
        dict(mdfs[0])

    def test_ListIdentifiers(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc')
        self.assertEqual(len(list(records)), 4)

    def test_ListIdentifiers_ignore_deleted(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc',
                                              ignore_deleted=True)
        # There are 2 deleted headers in the test data
        self.assertEqual(len(list(records)), 2)

    def test_Identify(self):
        identify = self.sickle.Identify()
        for attribute in ('repositoryName', 'baseURL', 'adminEmail',
                          'earliestDatestamp', 'deletedRecord',
                          'granularity', 'description', 'oai_identifier',
                          'sampleIdentifier'):
            self.assertTrue(hasattr(identify, attribute))
        # Smoke check: the Identify response must be convertible to a dict.
        dict(identify)

    def test_GetRecord(self):
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        self.assertEqual(record.header.identifier, oai_id)
        self.assertIn(oai_id, record.raw)
        self.assertEqual(record.header.datestamp, '2011-09-05T12:51:52Z')
        self.assertIsInstance(record.xml, etree._Element)
        # Smoke checks: the conversions below must not raise.
        binary_type(record)
        text_type(record)
        dict(record.header)
        self.assertEqual(dict(record), record.metadata)

    # Test OAI-specific exceptions

    @raises(BadArgument)
    def test_badArgument(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc', error='badArgument')

    @raises(CannotDisseminateFormat)
    def test_cannotDisseminateFormat(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='cannotDisseminateFormat')

    @raises(IdDoesNotExist)
    def test_idDoesNotExist(self):
        self.sickle.GetRecord(metadataPrefix='oai_dc', error='idDoesNotExist')

    @raises(NoSetHierarchy)
    def test_noSetHierarchy(self):
        self.sickle.ListSets(metadataPrefix='oai_dc', error='noSetHierarchy')

    @raises(BadResumptionToken)
    def test_badResumptionToken(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badResumptionToken')

    @raises(NoRecordsMatch)
    def test_noRecordsMatch(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='noRecordsMatch')

    @raises(OAIError)
    def test_undefined_OAI_error_XML(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='undefinedError')

    def test_OAIResponseIterator(self):
        # A separate client configured with OAIResponseIterator instead of
        # the default iterator.
        sickle = Sickle('fake_url', iterator=OAIResponseIterator)
        records = list(sickle.ListRecords(metadataPrefix='oai_dc'))
        self.assertEqual(len(records), 4)
Example #7
0
def get_identifiers_in_set(setSpec):
    """Request the ``oai_dc`` record headers that belong to ``setSpec``.

    The repository queried is the one at ``admin.get_repository_url()``;
    the result of ``Sickle.ListIdentifiers`` is returned unchanged.
    """
    repository_url = admin.get_repository_url()
    client = Sickle(repository_url)
    return client.ListIdentifiers(metadataPrefix='oai_dc', set=setSpec)
Example #8
0
from oaiharvests.utils import *
from oaiharvests.models import *
from sickle import Sickle

# com = Community.objects.all()[0]

# oai = OAIUtils()
# oai.list_oai_collections(com)

# OAI endpoint queried by this script and the setSpec of the community whose
# collections are looked up.
base_url = 'http://scholarspace.manoa.hawaii.edu/dspace-oai/request'
llt_id = 'com_10125_27123'

s = Sickle(base_url)

# Materialise all record headers of the community set up front.
record_headers = list(s.ListIdentifiers(metadataPrefix='oai_dc', set=llt_id))

# Maps collection setSpec -> None; the values are placeholders at this point.
community_collections = {}
for i in record_headers:
    # Iterate over associated sets looking for collections
    for j in i.setSpecs:
        # Only setSpecs starting with 'col' are treated as collections.
        if j[:3] == 'col':
            community_collections[j] = None  # register id in map

for i in s.ListSets():
    try:
        print community_collections[i.setSpec]
        community_collections[i.setSpec] = i.setName
        print i.setSpec, '==>', community_collections[i.setSpec]
        print i
    except KeyError as e:
Example #9
0
class OAIHarvester(Harvester):
    """Harvester that reads records from an OAI-PMH endpoint through Sickle."""

    def __init__(self, community, url, oai_metadata_prefix, oai_set, fromdate,
                 clean, limit, outdir, verify):
        """Set up the harvester and its Sickle client.

        ``oai_metadata_prefix`` and ``oai_set`` are kept for the OAI
        requests; every other argument is forwarded to ``Harvester``.
        NOTE(review): ``self.url``, ``self.verify`` and ``self.fromdate``
        are presumably set by ``Harvester.__init__`` -- confirm.
        """
        super().__init__(community, url, fromdate, clean, limit, outdir,
                         verify)
        logging.captureWarnings(True)
        self.mdprefix = oai_metadata_prefix
        self.oai_set = oai_set
        self.sickle = Sickle(self.url,
                             max_retries=3,
                             timeout=120,
                             verify=self.verify)

    def _oai_request_params(self):
        """Return the arguments shared by the List* requests of this class."""
        # NOTE: use dict args to pass "from" parameter
        # https://sickle.readthedocs.io/en/latest/tutorial.html#using-the-from-parameter
        return {
            'metadataPrefix': self.mdprefix,
            'set': self.oai_set,
            'ignore_deleted': True,
            'from': self.fromdate,
        }

    def identifier(self, record):
        """Return the OAI identifier found in the header of ``record``."""
        return record.header.identifier

    def matches(self):
        """Return the number of records matching the OAI query.

        The count is read from the ``complete_list_size`` of the resumption
        token of a ``ListIdentifiers`` request. On any failure a warning is
        logged and ``Harvester.matches()`` is used instead.
        """
        try:
            records = self.sickle.ListIdentifiers(
                **self._oai_request_params())
            # TODO: complete_list_size is not always set by OAI
            matches = int(records.resumption_token.complete_list_size)
        except Exception:
            # Deliberate best effort: fall back to the base implementation.
            logging.warning('Could not get complete list size from OAI.')
            matches = super().matches()
        return matches

    def check_metadata_format(self):
        """Log an error when the repository does not list ``self.mdprefix``.

        A failing ``ListMetadataFormats`` request only produces a warning;
        this method never raises for an unsupported format.
        """
        md_formats = None
        try:
            md_formats = [
                f.metadataPrefix for f in self.sickle.ListMetadataFormats()
            ]
        except Exception:
            logging.warning(
                "OAI does not support ListMetadataFormats request.")
        if md_formats and self.mdprefix not in md_formats:
            logging.error(
                f'The metadata format {self.mdprefix} is not supported by the OAI repository. Formats={md_formats}'
            )

    def get_records(self):
        """Yield the records returned by a ``ListRecords`` request.

        Yields nothing (and logs a warning) when the request raises
        ``NoRecordsMatch``.

        Raises:
            HarvesterError: when the request raises
                ``CannotDisseminateFormat``.
        """
        self.check_metadata_format()
        try:
            yield from self.sickle.ListRecords(**self._oai_request_params())
        except NoRecordsMatch:
            logging.warning(
                f'No records match the OAI query. from={self.fromdate}')
        except CannotDisseminateFormat as err:
            raise HarvesterError(
                f'The metadata format {self.mdprefix} is not supported by the OAI repository.'
            ) from err

    def _write_record(self, fp, record, pretty_print=True):
        """Serialize ``record.xml`` and write it as text to ``fp``."""
        xml = etree.tostring(record.xml,
                             pretty_print=pretty_print).decode('utf8')
        fp.write(xml)