Esempio n. 1
0
def main():
    """Harvest ``oai_dc`` records from the Arizona repository into ``newFile``.

    Issues a ListRecords request for set ``com_10150_129649`` starting from
    2017-04-05 and writes every returned record, wrapped in one ``<OAI-PMH>``
    root element, to ``newFile``. ``newFile`` is not created here; it must
    already exist in the enclosing scope as a writable text file object.
    """
    endpoint = 'http://arizona.openrepository.com/arizona/oai/request?'
    # 'from' is a reserved word, so the request arguments travel in a dict.
    query = {
        'metadataPrefix': 'oai_dc',
        'set': 'com_10150_129649',
        'from': '2017-04-05',
    }
    recs = Sickle(endpoint).ListRecords(**query)

    newFile.write('<?xml version="1.0" encoding="utf-8"?>')
    newFile.write('<OAI-PMH xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:OAI-PMH="http://www.openarchives.org/OAI/2.0/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">')
    for record in recs:
        # Serialise the record and push it through a UTF-8 round-trip before
        # writing, exactly as the harvest has always done.
        serialized = str(record).encode('utf8').decode('utf8')
        newFile.write(str(serialized))
    newFile.write('</OAI-PMH>')
Esempio n. 2
0
def get_record_metadata(repository_url, identifier):
    """Return the ``metadata`` of a single record fetched with GetRecord.

    :param repository_url: OAI-PMH endpoint handed to ``Sickle``.
    :param identifier: OAI identifier of the record to fetch.
    :return: the ``metadata`` attribute of the record, requested in the
        ``oai_dc`` format.
    """
    request_args = {'identifier': identifier, 'metadataPrefix': 'oai_dc'}
    client = Sickle(repository_url)
    return client.GetRecord(**request_args).metadata
Esempio n. 3
0
def download_oai_dc(outfile=None,
                    base_url='http://bibliotheque-numerique.inha.fr/oai',
                    sets=16800,
                    force_refresh=False,
                    max_records=100):
    """Download ``oai_dc`` metadata for one set of an OAI-PMH repository.

    :param outfile: optional path of a gzip-compressed JSON cache. When it
        exists and ``force_refresh`` is false its content is returned without
        contacting the repository; otherwise the fetched records are written
        to it.
    :param base_url: OAI-PMH endpoint handed to ``Sickle``.
    :param sets: integer id of the set; requested as ``oai:sets:<id>``.
    :param force_refresh: ignore an existing ``outfile`` and harvest again.
    :param max_records: stop after this many records (default 100, the
        previous hard-coded limit); ``None`` removes the limit.
    :return: list with the ``metadata`` of each fetched record, or whatever
        ``read_json_gzip`` returns when the cache is used.
    """
    # ``outfile`` is optional: only look for a cache when a path was given,
    # because ``os.path.exists(None)`` raises TypeError.
    if outfile and not force_refresh and os.path.exists(outfile):
        return read_json_gzip(outfile)

    sickle = Sickle(base_url)
    records = sickle.ListRecords(**{
        'metadataPrefix': 'oai_dc',
        'set': "oai:sets:%d" % sets
    })

    records_fetched = []
    for index, record in enumerate(tqdm(records)):
        if max_records is not None and index == max_records:
            break
        records_fetched.append(record.metadata)

    if outfile:
        write_json_gzip(outfile, records_fetched)

    return records_fetched
Esempio n. 4
0
 def _get_random_configuration(self):
     """Build a random harvesting configuration from the known servers.

     Stores the server list on ``self.servers``, draws between one and three
     of them (``random.choices`` samples with replacement) and, for each
     server, draws a random subset of the first twenty set specs it lists.

     :return: dict shaped ``{'contexts': {'TEST': {'SERVER<i>': entry}}}``;
         every entry has a ``url`` and, when at least one set was drawn, a
         ``sets`` list.
     """
     self.servers = TestDynamicListRecords._get_servers()
     test_key = 'TEST'
     configuration = {'contexts': {test_key: {}}}
     server_count = random.randint(1, min(3, len(self.servers)))
     servers = random.choices(self.servers, k=server_count)
     i = 0
     for server in servers:
         server_key = 'SERVER' + str(i)
         server_entry = {'url': server}
         configuration['contexts'][test_key][server_key] = server_entry
         set_names = []
         server_sickle = Sickle(server)
         try:
             for set_name in server_sickle.ListSets():
                 if len(set_names) == 20:
                     break
                 set_names.append(set_name.setSpec)
         except Exception:
             # Best effort: a server whose sets cannot be listed is skipped.
             # ``i`` is deliberately not advanced, so the next server reuses
             # (and overwrites) this key. Catching ``Exception`` rather than
             # everything lets Ctrl-C / SystemExit still stop the run.
             continue
         sets = set(
             random.choices(set_names,
                            k=random.randint(0, min(20, len(set_names)))))
         if sets:
             server_entry['sets'] = list(sets)
         i += 1
     return configuration
 def run(self):
     """Harvest every MARCXML record from invenio.nusl.cz and index it.

     Each record is indexed into ``self.index`` under the id found in its
     ``001`` field. A record that fails to index does not stop the run: its
     dictionary and the traceback are appended to a ``<timestamp>.err`` file
     inside ``self.path`` and harvesting continues with the next record.
     """
     timestamp = datetime.utcnow()
     # One error file per run; the name does not change between records.
     error_file_name = f'{timestamp.strftime("%Y%m%dT%H%M%S")}.err'
     sickle = Sickle('http://invenio.nusl.cz/oai2d/')
     sickle.class_mapping['ListRecords'] = MarcXMLParser
     sickle.class_mapping['GetRecord'] = MarcXMLParser
     oai_logger.info("Loading records")
     records = sickle.ListRecords(metadataPrefix='marcxml')
     for idx, record in enumerate(records):
         print(f"{idx}. {record.id}")
         oai_logger.info(f"{idx}. {record.id}")
         try:
             current_search_client.index(
                 index=self.index,
                 id=record.marc_dict["001"],
                 body=record.marc_dict
             )
         except Exception:
             # Deliberate best effort, but no longer a bare ``except`` so
             # KeyboardInterrupt / SystemExit can still abort the harvest.
             exc_traceback = traceback.format_exc()
             print(exc_traceback)
             print("\n\n\n")
             file_path = os.path.join(self.path, error_file_name)
             with open(file_path, "a") as f:
                 f.write(
                     f"Dictionary: {record.marc_dict}\n\n"
                     f"{exc_traceback}\n\n\n\n")
Esempio n. 6
0
    def fetch_oai_recs_day(self, date):
        """
        Generator over the works deposited on a single day.

        The day is sent as both the ``from`` and the ``until`` bound of a
        ListRecords request against ``self.endpoint_url``.

        :param date: object with ``isoformat()`` naming the day to fetch
        :return: yields (identifier, raw record) pairs, both UTF-8 encoded;
                 yields nothing when the repository reports no matches
        """
        api = Sickle(self.endpoint_url)
        day = date.isoformat()
        # 'from' is a Python keyword, so the arguments are passed as an
        # expanded dict, as the Sickle documentation recommends.
        query = {
            'metadataPrefix': self.metadata_prefix,
            'from': day,
            'until': day,
        }
        try:
            matches = api.ListRecords(**query)
        except oaiexceptions.NoRecordsMatch:
            _LOGGER.info('OAI request produced no records.')
            return

        for rec in matches:
            key = rec.header.identifier.encode('utf-8')
            yield key, rec.raw.encode('utf-8')
Esempio n. 7
0
def getData(request):
    """
    POST http://localhost/oai_pmh/api/getdata/
    POST data query='{"url":"value"}'

    Validates the payload, checks that the registry behind ``url`` answers
    an OAI-PMH Identify request, then fetches ``url`` itself and returns the
    response text.

    Raises OAIAPISerializeLabelledException for an invalid payload and
    OAIAPIException for a url without a query string or a non-200 answer;
    only ``requests.HTTPError`` is converted into a Response here.
    """
    try:
        serializer = IdentifySerializer(data=request.DATA)
        if not serializer.is_valid():
            raise OAIAPISerializeLabelledException(errors=serializer.errors, status=status.HTTP_400_BAD_REQUEST)

        url = request.POST['url']
        if '?' not in str(url):
            raise OAIAPIException(message='An error occurred, url malformed.', status=status.HTTP_400_BAD_REQUEST)

        registryURl = str(url).split('?')[0]
        # Check if the OAI Registry is available
        sickle = Sickle(registryURl)
        sickle.Identify()
        http_response = requests.get(url)
        if http_response.status_code != status.HTTP_200_OK:
            raise OAIAPIException(message='An error occurred.', status=http_response.status_code)
        return Response(http_response.text, status=status.HTTP_200_OK)
    except requests.HTTPError as err:
        # ``except X as err`` parses on Python 2.6+ and 3; ``.message`` only
        # exists on Python 2 exceptions, so fall back to str(err).
        message = err.message if hasattr(err, 'message') else str(err)
        content = APIMessage.getMessageLabelled(message)
        return Response(content, status=err.response.status_code)
 def __init__(self,
              provider: OAIProvider,
              parser_name: str = None,
              unhandled_paths: set = None,
              validation: Callable = None,
              create_record: Callable = None,
              delete_record: Callable = None,
              update_record: Callable = None,
              pid_type: str = None,
              oai_identifiers: List[str] = None):
     """Wire a synchronizer to ``provider``.

     Creates a Sickle client for ``provider.oai_endpoint``, loads the
     registry, then looks up the provider's parsers and the rules and parser
     registered under ``parser_name`` (each falls back to an empty dict).
     The remaining arguments are stored unchanged as handlers/options.
     """
     super().__init__(provider)
     self.pid_type = pid_type
     self.provider = provider
     self.oai_sync = None
     endpoint = self.provider.oai_endpoint
     self.sickle = Sickle(endpoint)

     # The registry is loaded before parsers and rules are requested.
     registry.load()
     self.parsers = provider.get_parsers()
     rules = provider.get_rules(parser_name)
     self.rules = rules or {}
     parser = self.parsers.get(parser_name)
     self.parser = parser or {}
     transformer = OAITransformer(self.rules,
                                  unhandled_paths=unhandled_paths)
     self.transformer = transformer

     # Optional callbacks supplied by the caller.
     self.validation_handler = validation
     self.create_record_handler = create_record
     self.update_record_handler = update_record
     self.delete_record_handler = delete_record
     self.oai_identifiers = oai_identifiers
Esempio n. 9
0
def listIdentifiers(request):
    """
    POST http://localhost/oai_pmh/api/listidentifiers
    POST data query='{"url":"value", "metadataprefix":"value"}' optional {"set":"value"}

    Validates the payload, runs ListIdentifiers against ``url`` and returns
    the serialized headers. OAIAPIException is answered with its own
    response; any other error becomes a labelled HTTP 500 response.
    """
    try:
        serializer = RegistryURLSerializer(data=request.DATA)
        if not serializer.is_valid():
            raise OAIAPISerializeLabelledException(errors=serializer.errors, status=status.HTTP_400_BAD_REQUEST)

        url = request.DATA['url']
        metadataprefix = request.DATA['metadataprefix']
        setH = request.DATA.get('set', None)
        sickle = Sickle(url)
        rsp = sickle.ListIdentifiers(metadataPrefix=metadataprefix, set=setH)
        # Plain iteration replaces the manual ``.next()`` / StopIteration loop.
        rtn = [dict(header) for header in rsp]

        serializer = ListIdentifierSerializer(rtn)
        return Response(serializer.data, status=status.HTTP_200_OK)
    except OAIAPIException as e:
        return e.response()
    except Exception as e:
        # ``e.message`` does not exist on Python 3 exceptions and would raise
        # inside this handler; format the exception itself, as getRecord does.
        content = APIMessage.getMessageLabelled('An error occurred when attempting to identify resource: %s' % e)
        return Response(content, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
Esempio n. 10
0
def getRecord(request):
    """
    POST http://localhost/oai_pmh/api/rest/getrecord
    POST data query='{"url":"value", "identifier":"value", "metadataprefix":"value"}'

    Validates the payload, fetches the record with GetRecord and returns it
    serialized as a one-element list. ``metadata`` is ``None`` for a deleted
    record. OAIAPIException is answered with its own response; any other
    error becomes a labelled HTTP 500 response.
    """
    try:
        serializer = GetRecordSerializer(data=request.DATA)
        if not serializer.is_valid():
            raise OAIAPISerializeLabelledException(errors=serializer.errors, status=status.HTTP_400_BAD_REQUEST)

        url = request.DATA['url']
        identifier = request.DATA['identifier']
        metadataprefix = request.DATA['metadataprefix']
        sickle = Sickle(url)
        grResponse = sickle.GetRecord(metadataPrefix=metadataprefix, identifier=identifier)
        record = Record(grResponse.xml)

        header = record.header
        entry = {
            "identifier": header.identifier,
            "datestamp": header.datestamp,
            "deleted": record.deleted,
            "sets": header.setSpecs,
            "metadataPrefix": metadataprefix,
        }
        if record.deleted:
            entry["metadata"] = None
        else:
            metadata_path = './/' + '{http://www.openarchives.org/OAI/2.0/}' + 'metadata/'
            entry["metadata"] = etree.tostring(record.xml.find(metadata_path))
        entry["raw"] = record.raw

        serializer = RecordSerializer([entry])
        return Response(serializer.data, status=status.HTTP_200_OK)
    except OAIAPIException as e:
        return e.response()
    except Exception as e:
        content = APIMessage.getMessageLabelled('An error occurred when attempting to retrieve record. %s'%e)
        return Response(content, status=status.HTTP_500_INTERNAL_SERVER_ERROR)
Esempio n. 11
0
    def __init__(self, config, query):
        """Create the OAI client and remember the query.

        :param config: mapping providing ``config['default']['oai_api_url']``
        :param query: queried attribute(s), i.e. type, description, format,
            subject, etc.; stored as ``self.query_dict``
        """
        default_section = config['default']
        self.sickle = Sickle(default_section['oai_api_url'])
        # The token is fetched once the client exists and before the query
        # is stored.
        self.resumption_token = self.get_token()

        self.query_dict = query
Esempio n. 12
0
def list_sets_with_counts(repository_url):
    """List up to 500 sets of a repository together with their identifiers.

    :param repository_url: OAI-PMH endpoint handed to ``Sickle``.
    :return: list of dicts with ``setSpec``, ``setName`` and
        ``set_identifiers`` (the result of ``list_identifiers`` for the set).

    Any error other than running out of sets is turned into ``abort(400)``.
    """
    sickle = Sickle(repository_url)
    setlist = []
    set_iterator = sickle.ListSets()

    try:
        remaining = 500
        while remaining > 0:
            current = set_iterator.next()
            identifiers = list_identifiers(current.setSpec)
            entry = {
                'setSpec': current.setSpec,
                'setName': current.setName,
                'set_identifiers': identifiers,
            }
            setlist.append(entry)
            remaining -= 1
    except StopIteration:
        # Fewer than 500 sets: the iterator simply ran out.
        pass
    except Exception as e:
        abort(400, e)

    return setlist
Esempio n. 13
0
 def __init__(self, url_harvest, extra_data, **kwargs):
     """Set up the OAI client and start a ListRecords harvest.

     ``extra_data`` is the set spec, either bare or as a query string
     containing ``set=...``. When it is given, the harvest is restricted to
     that set and the record class is switched for the ``didl`` and
     ``marcxml`` metadata prefixes; otherwise Dublin Core records from the
     whole repository are requested. Deleted records are always ignored.
     """
     super(OAIFetcher, self).__init__(url_harvest, extra_data, **kwargs)
     # TODO: check extra_data?
     self.oai_client = Sickle(self.url)
     self._metadataPrefix = self.get_metadataPrefix(extra_data)
     # ensure not cached in module?
     for verb in ('ListRecords', 'GetRecord'):
         self.oai_client.class_mapping[verb] = SickleDCRecord

     if not extra_data:
         self.records = self.oai_client.ListRecords(
             metadataPrefix=self._metadataPrefix, ignore_deleted=True)
         return

     # extra data is set spec
     if 'set' in extra_data:
         self._set = parse_qs(extra_data)['set'][0]
     else:
         self._set = extra_data

     # Prefix-specific record classes replace the Dublin Core default.
     prefix = self._metadataPrefix.lower()
     if prefix == 'didl':
         record_class = SickleDIDLRecord
     elif prefix == 'marcxml':
         record_class = SickleMARCRecord
     else:
         record_class = None
     if record_class is not None:
         self.oai_client.class_mapping['ListRecords'] = record_class
         self.oai_client.class_mapping['GetRecord'] = record_class

     self.records = self.oai_client.ListRecords(
         metadataPrefix=self._metadataPrefix,
         set=self._set,
         ignore_deleted=True)
Esempio n. 14
0
def main():
    """Harvest preprint records from a given date and save them as JSON.

    Reads the mandatory ``-f/--from_date`` option, lists the records of
    ``OAI_PMH_PREPRINT_ENDPOINT`` from that date, merges the result of
    ``parse`` for every record into one dict and hands it to ``save`` under
    ``PREPRINT_DICTIONARY_PREFIX + from_date + '.json'``.
    """
    cli = argparse.ArgumentParser()
    cli.add_argument('-f', '--from_date', required=True)
    from_date = cli.parse_args().from_date

    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s] %(levelname)s %(message)s',
                        datefmt='%d/%b/%Y %H:%M:%S')

    # NOTE(review): verify=False is passed through unchanged; it looks like
    # it disables certificate verification - confirm this is intended.
    oai_client = Sickle(endpoint=OAI_PMH_PREPRINT_ENDPOINT,
                        max_retries=3,
                        verify=False)
    harvest_args = {
        'metadataPrefix': OAI_METADATA_PREFIX,
        'from': from_date,
    }
    records = oai_client.ListRecords(**harvest_args)

    logging.info('Obtendo dados do OAI-PMH Preprints para date >= %s',
                 from_date)
    data = {}
    for record in records:
        data.update(parse(record))

    save(data, PREPRINT_DICTIONARY_PREFIX + from_date + '.json')
Esempio n. 15
0
File: utils.py Progetto: llcit/llt
def get_bitstream_url(collection, record_in):
    """Harvest the bitstream urls of a record from its repository.

    E.g., https://scholarspace.manoa.hawaii.edu/bitstream/10125/25006/1/editor.pdf

    The record is fetched again in the ``ore`` format and parsed with
    ``LltRecordBitstream``. Either url that cannot be read from the metadata
    is reported on stdout and left as ``None``.

    :return: dict with the keys ``bitstream`` and ``bitstream_txt``
    """
    oai_id = record_in.header.identifier
    client = Sickle(collection.community.repository.base_url)
    client.class_mapping['GetRecord'] = LltRecordBitstream
    record = client.GetRecord(metadataPrefix='ore', identifier=oai_id)

    urls = dict.fromkeys(('bitstream', 'bitstream_txt'))

    try:
        urls['bitstream'] = record.metadata['bitstream']
    except Exception as e:
        print(e, 'Unable to construct bitstream url for',
              record_in.header.identifier)

    try:
        first_txt = record.metadata['bitstream_txt'][0]
        urls['bitstream_txt'] = first_txt.replace('+', '%20')
    except Exception as e:
        print(e, 'Unable to construct bitstream_txt url for',
              record_in.header.identifier)

    return urls
Esempio n. 16
0
 def __init__(self, source=None, setspec=None, **kwargs):
     """Init the loader for remote OAI-PMH access.

     :param source: OAI-PMH endpoint; defaults to the
         ``OPENAIRE_OAIPMH_ENDPOINT`` application setting.
     :param setspec: set to harvest; defaults to the
         ``OPENAIRE_OAIPMH_DEFAULT_SET`` application setting.
     """
     super(RemoteOAIRELoader, self).__init__(
         source or current_app.config['OPENAIRE_OAIPMH_ENDPOINT'], **kwargs)
     self.client = Sickle(self.source)
     # No trailing comma after the expression: it used to wrap the set spec
     # in a 1-tuple instead of storing the value itself.
     self.setspec = (setspec or
                     current_app.config['OPENAIRE_OAIPMH_DEFAULT_SET'])
Esempio n. 17
0
 def test_override_encoding(self):
     # The encoding given to Sickle must end up on the response object
     # returned by the patched ``requests.get``.
     fake_response = Mock(text='<xml/>')
     fake_get = Mock(return_value=fake_response)
     with patch('sickle.app.requests.get', fake_get):
         client = Sickle('url', encoding='encoding')
         client.ListSets()
         self.assertEqual(fake_response.encoding, 'encoding')
Esempio n. 18
0
 def descriptions(self):
     """Retrieve the record descriptions and dump them to the cache file.

     Every ``oai_dc`` record is keyed by the second comma-separated part of
     its first ``source`` value. Subject occurrences are counted and logged,
     a category page is created for each new subject's category when it does
     not exist yet, and the collected metadata is written as JSON to
     ``self.__cache_meta__``.
     """
     client = Sickle(self.__url_api__)
     records = list(client.ListRecords(metadataPrefix='oai_dc'))
     descr = OrderedDict()
     subjects = OrderedDict()
     cats = []
     for record in records:
         metadata = record.metadata
         item = metadata['source'][0].split(',')[1].strip()
         descr[item] = metadata
         self.__log__.info('%s', item)
         if 'subject' not in metadata:
             self.__log__.warning('  no subject for %s', item)
             continue
         for subject in metadata['subject']:
             if subject in subjects:
                 subjects[subject] += 1
                 continue
             # First occurrence: start the count and remember its category.
             subjects[subject] = 1
             cats.append(self.subject_to_category(subject))
     self.__log__.info('Parsed %s items', len(records))
     self.__log__.info('Subjects: %s', json.dumps(subjects, indent=2))
     for cat in cats:
         self.__log__.info('  [[%s]]', cat)
         page = self.__site__.pages[cat]
         if page.exists:
             continue
         page.save('[[{}]]'.format(self.__category__), 'Upload cat')
     self.__log__.info('Dumping metadata %s', self.__cache_meta__)
     with open(self.__cache_meta__, 'w') as cache_file:
         json.dump(descr, cache_file, indent=4, ensure_ascii=False)
Esempio n. 19
0
def ssoarharvest(filename='support_data/data_harvest1.json'):
    """Harvest all ``oai_genios`` records from SSOAR into one JSON file.

    Each record's XML is converted to a plain dict with ``xmltodict`` and the
    dicts are written to ``filename`` as ``{"result": [...]}``. A record that
    cannot be converted is skipped; the number of skipped records is printed
    at the end. Progress is printed every 10000 records.

    :param filename: path of the JSON file to write.
    """
    sickle = Sickle('https://www.ssoar.info/OAIHandler/request')
    records = sickle.ListRecords(metadataPrefix='oai_genios')

    converted = []
    failed_indexes = []
    for index, record in enumerate(records):
        try:
            parsed = xmltodict.parse(etree.tostring(record.xml))
            # Normalise to plain JSON types with json.loads instead of eval():
            # eval() choked on ``null``/``true``/``false`` (dropping those
            # records) and executed text derived from harvested data.
            converted.append(json.loads(json.dumps(parsed)))
        except Exception:
            failed_indexes.append(index)
        if (index + 1) % 10000 == 0:
            print(index + 1)

    if failed_indexes:
        print('%d records could not be converted' % len(failed_indexes))

    with open(filename, 'w') as fp:
        json.dump({"result": converted}, fp, indent=4)
    def fetch_iter(self):
        """Yield every record returned by ListRecords for ``self.dic``.

        The endpoint is read from ``self._oaiconfig['OAI']['url']``. Errors
        are logged and end the iteration instead of propagating: bad
        arguments and other OAI errors go to the exception logger, an empty
        result and non-OAI errors go to the summary logger.
        """
        try:
            sickle = Sickle(self._oaiconfig['OAI']['url'])
            records_iter = sickle.ListRecords(**self.dic)

            for record in records_iter:
                yield record

        except BadArgument as ba:
            self._exception_logger.error(
                "bad argument exception {EXCEPTION}".format(EXCEPTION=str(ba)))
        except NoRecordsMatch as noRecordsmatch:
            # Must be tested before OAIError: NoRecordsMatch derives from it,
            # so in the old order this handler could never run.
            self._summary_logger.error("no records matched {EXCEPTION}".format(
                EXCEPTION=str(noRecordsmatch)))
        except OAIError as oaiError:
            self._exception_logger.error(
                "OAIError exception {EXCEPTION}".format(
                    EXCEPTION=str(oaiError)))
        except Exception as baseException:
            self._summary_logger.error(
                "base exception occured - not directly related to OAI {EXCEPTION}"
                .format(EXCEPTION=str(baseException)))
        else:
            print("oai fetching finished successfully")
Esempio n. 21
0
    def _get_database(self, number):
        """
        Harvest ``oai_dc`` records from ``self.url`` into ``self.database``.

        Every record is passed to ``self._get_record_data``; truthy results
        are appended to ``self.database`` and counted. Harvesting stops when
        ``number`` records have been stored or when the repository has no
        more records. Progress is printed every 100 stored records.

        :param number: how many records to store before stopping
        :return: None
        """
        sickle = Sickle(self.url)
        records = sickle.ListRecords(metadataPrefix='oai_dc',
                                     ignore_deleted=True)
        no_of_records = 0
        # A ``for`` loop ends cleanly when the iterator is exhausted; calling
        # ``records.next()`` by hand let StopIteration escape to the caller.
        for record in records:
            if not record:
                break
            data = self._get_record_data(record)
            if not data:
                continue
            self.database.append(data)
            no_of_records += 1
            if no_of_records % 100 == 0:
                print("Progress : {no_of_records} records Downloaded".format(
                    no_of_records=no_of_records))
            if no_of_records == number:
                break
Esempio n. 22
0
def get_records(identifiers, metadata_prefix=None, url=None, name=None,
                encoding=None):
    """Harvest specific records from an OAI repo via OAI-PMH identifiers.

    :param identifiers: list of unique identifiers for records to be harvested.
    :param metadata_prefix: The prefix for the metadata return
                            (defaults to 'oai_dc').
    :param url: The url to be used to create the endpoint.
    :param name: The name of the OAIHarvestConfig to use instead of passing
                 specific parameters.
    :param encoding: Override the encoding returned by the server. ISO-8859-1
                     if it is not provided by the server.
    :return: request object, list of harvested records
    :raises NameOrUrlMissing: when neither ``name`` nor ``url`` is given.
    """
    if not name and not url:
        raise NameOrUrlMissing(
            "Retry using the parameters -n <name> or -u <url>."
        )

    if name:
        url, configured_prefix, _, __ = get_info_by_oai_name(name)
        # An explicitly given prefix wins over the one stored with the name.
        if metadata_prefix is None:
            metadata_prefix = configured_prefix

    request = Sickle(url, encoding=encoding)
    records = [
        request.GetRecord(**{
            'identifier': identifier,
            'metadataPrefix': metadata_prefix or "oai_dc",
        })
        for identifier in identifiers
    ]
    return request, records
Esempio n. 23
0
File: utils.py Progetto: llcit/llt
    def test_list_oai_collections(self, community):
        """ Builds the list of (setSpec, setName) tuples for the collections
        (a secondary grouping concept in OAI) "owned" by the given community,
        sorted by name. The list is stored on ``self.collections`` and
        returned.

        Utilizes OAI-PMH verbs: ListIdentifiers and ListSets
        """
        sickle = Sickle(community.repository.base_url)

        # Headers of every record in the community's set.
        record_headers = sickle.ListIdentifiers(metadataPrefix='oai_dc',
                                                set=community.identifier)

        # Register each collection id ('col...' set spec) met in the headers.
        community_collections = {
            spec: None
            for header in record_headers
            for spec in header.setSpecs
            if spec[:3] == 'col'
        }

        # ListSets carries the names. Its set specs are matched after their
        # first three characters are replaced with 'col' (the listing
        # reports collections with a 'com' prefix).
        for oai_set in sickle.ListSets():
            collection_id = 'col' + oai_set.setSpec[3:]
            if collection_id in community_collections:
                community_collections[collection_id] = oai_set.setName

        # Convert the map to (id, name) tuples, then order them by name.
        self.collections = community_collections.items()
        by_name = sorted(self.collections, key=lambda pair: pair[1])
        self.collections = by_name
        return self.collections
Esempio n. 24
0
 def _oai2d_endpoint_identifiers(self):
     """Return a set of the Community OAI Set recids from OAI endpoint.

     The recid is the integer after the last ``:`` of each identifier.
     ``requests.get`` is patched while the identifiers are listed.
     """
     with patch('sickle.app.requests.get', new=sickle_requests_get_mock()):
         client = Sickle('http://auditor/oai2d')
         headers = client.ListIdentifiers(set=self.community.oaiset_spec,
                                          metadataPrefix='oai_dc')
         recids = set()
         for header in headers:
             recid_text = header.identifier.rpartition(':')[2]
             recids.add(int(recid_text))
         return recids
Esempio n. 25
0
    def __init__(self,
                 name,
                 provider_code,
                 oai_endpoint,
                 metadata_prefix,
                 set_,
                 constant_fields: dict = None,
                 parser: Callable = None,
                 transformer=None,
                 endpoints=None,
                 default_endpoint: str = "recid",
                 endpoint_mapping=None,
                 pid_field=None,
                 from_: str = None,
                 endpoint_handler: dict = None,
                 bulk: bool = True,
                 pre_processors: dict = None,
                 post_processors: dict = None,
                 index: str = None):
        """Store the synchronisation settings and create the OAI client.

        Resets the ``deleted``/``created``/``modified`` counters, builds a
        Sickle client for ``oai_endpoint`` and keeps the remaining arguments
        as attributes. ``endpoint_mapping`` and ``constant_fields`` default
        to empty dicts, ``pid_field`` to the ``PIDSTORE_RECID_FIELD``
        application setting (``"recid"`` when unset). ``from_`` is only
        assigned to ``self.from_`` when it is truthy.
        """
        # Counters
        self.only_fetch = False
        self.deleted = self.created = self.modified = 0

        if endpoint_mapping is None:  # pragma: no cover
            endpoint_mapping = {}
        if pid_field is not None:  # pragma: no cover
            self.pid_field = pid_field
        else:
            self.pid_field = current_app.config.get('PIDSTORE_RECID_FIELD',
                                                    "recid")

        # Identity of the provider and its OAI client.
        self.name = name
        self.provider_code = provider_code
        self.metadata_prefix = metadata_prefix
        self.oai_endpoint = oai_endpoint
        self.oai_sync = None
        self.sickle = Sickle(self.oai_endpoint)

        # Parsing / transformation configuration.
        self.parser = parser
        self.transformer = transformer
        self.endpoints = endpoints
        self.default_endpoint = default_endpoint
        self.endpoint_mapping = endpoint_mapping
        self.set_ = set_
        self.constant_fields = constant_fields or {}

        self._from = None
        if from_:
            self.from_ = from_

        # Processing options and search backend.
        self.endpoint_handler = endpoint_handler
        self.bulk = bulk
        self.pre_processors = pre_processors
        self.post_processors = post_processors
        self.overwrite = False
        self.es_client = current_search_client
        self._index = index
Esempio n. 26
0
 def test_override_encoding(self):
     # ListSets must issue exactly one GET to the endpoint carrying only
     # the verb as parameter.
     fake_response = Mock(text='<xml/>', content='<xml/>', status_code=200)
     fake_get = Mock(return_value=fake_response)
     expected_params = {'verb': 'ListSets'}
     with patch.object(Session, 'get', fake_get):
         client = Sickle('url', encoding='encoding')
         client.ListSets()
         fake_get.assert_called_once_with('url', params=expected_params)
Esempio n. 27
0
File: utils.py Progetto: llcit/llt
 def harvest_oai_collection_records_sickle(self, collection):
     """Return the ListRecords result for ``collection`` in ``dim`` format.

     Records are parsed with ``LltRecord``, deleted records are ignored and
     the request is limited to the set named by ``collection.identifier``.
     """
     client = Sickle(collection.community.repository.base_url)
     for verb in ('ListRecords', 'GetRecord'):
         client.class_mapping[verb] = LltRecord
     return client.ListRecords(metadataPrefix='dim',
                               ignore_deleted=True,
                               set=collection.identifier)
Esempio n. 28
0
def _fetch_records(endpoint, count):
    """Return the ``metadata`` of the first ``count`` non-deleted records.

    :param endpoint: OAI-PMH endpoint handed to ``Sickle``.
    :param count: number of records after which harvesting stops.
    :return: list of ``oai_dc`` metadata objects, at most ``count`` long.
    """
    client = Sickle(endpoint)
    harvested = []
    for rec in client.ListRecords(metadataPrefix='oai_dc',
                                  ignore_deleted=True):
        # One entry is stored per record seen, so the length doubles as the
        # position of the current record.
        if len(harvested) == count:
            break
        harvested.append(rec.metadata)
    return harvested
Esempio n. 29
0
def harvest_oai(**kwargs):
    """Create OAI ListRecords Iterator for Harvesting Data.

    Reads ``oai_endpoint`` and ``harvest_params`` from ``kwargs``, logs both
    and returns the ListRecords result of a Sickle client created with
    ``retry_status_codes=[500, 503]``.
    """
    oai_endpoint = kwargs.get("oai_endpoint")
    harvest_params = kwargs.get("harvest_params")
    for message, value in (("Harvesting from %s", oai_endpoint),
                           ("Harvesting %s", harvest_params)):
        logging.info(message, value)
    client = Sickle(oai_endpoint, retry_status_codes=[500, 503])
    return client.ListRecords(**harvest_params)
Esempio n. 30
0
 def parse_single(self, response):
     """Fetch the record named in ``response.meta`` and parse it.

     The record is requested with GetRecord in ``self.format``, remembered
     in ``self._crawled_records`` under its identifier, and its raw XML is
     handed to ``self.parse_record`` through an XML selector.
     """
     client = Sickle(self.url)
     identifier = response.meta['identifier']
     record = client.GetRecord(metadataPrefix=self.format,
                               identifier=identifier)
     self._crawled_records[identifier] = record
     xml_response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
     return self.parse_record(Selector(xml_response, type='xml'))
Esempio n. 31
0
class TestCase(unittest.TestCase):
    """Tests for the Sickle client with ``Sickle.harvest`` replaced by
    ``fake_harvest``."""

    def setUp(self):
        patcher = mock.patch('sickle.app.Sickle.harvest', fake_harvest)
        patcher.start()
        # Undo the patch after every test; it used to stay active forever.
        self.addCleanup(patcher.stop)
        self.sickle = Sickle('fake_url')

    def test_OAIResponse(self):
        response = self.sickle.harvest(verb='ListRecords', metadataPrefix='oai_dc')
        response.xml
        response.raw

    def test_broken_XML(self):
        response = self.sickle.harvest(
            verb='ListRecords', resumptionToken='ListRecordsBroken.xml')
        response.xml
        response.raw

    def test_ListRecords(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc')
        assert len(list(records)) == 8

    def test_ListRecords_ignore_deleted(self):
        records = self.sickle.ListRecords(metadataPrefix='oai_dc', ignore_deleted=True)
        # Deleted records in the test data are skipped.
        num_records = len(list(records))
        assert num_records == 4

    def test_ListSets(self):
        sets = list(self.sickle.ListSets())
        assert len(sets) == 131
        # Index the list explicitly: a comprehension variable is not visible
        # after the comprehension on Python 3.
        dict(sets[-1])

    def test_ListMetadataFormats(self):
        mdfs = list(self.sickle.ListMetadataFormats())
        assert len(mdfs) == 5
        dict(mdfs[-1])

    def test_ListIdentifiers(self):
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc')
        assert len(list(records)) == 4

    def test_ListIdentifiers_ignore_deleted(self):
        records = self.sickle.ListIdentifiers(
            metadataPrefix='oai_dc', ignore_deleted=True)
        # There are 2 deleted headers in the test data
        num_records = len(list(records))
        assert num_records == 2

    def test_Identify(self):
        identify = self.sickle.Identify()
        for attribute in ('repositoryName', 'baseURL', 'adminEmail',
                          'earliestDatestamp', 'deletedRecord', 'granularity',
                          'description', 'oai_identifier',
                          'sampleIdentifier'):
            assert hasattr(identify, attribute)
        dict(identify)

    def test_GetRecord(self):
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        assert record.header.identifier == oai_id
        assert oai_id in record.raw
        record.xml
        str(record)
        unicode(record)
        dict(record.header)
        dict(record.origin)
        assert dict(record) == record.metadata

    # Test OAI-specific exceptions

    @raises(BadArgument)
    def test_badArgument(self):
        self.sickle.ListRecords(metadataPrefix='oai_dc',
                                error='badArgument')

    @raises(CannotDisseminateFormat)
    def test_cannotDisseminateFormat(self):
        self.sickle.ListRecords(
            metadataPrefix='oai_dc', error='cannotDisseminateFormat')

    @raises(IdDoesNotExist)
    def test_idDoesNotExist(self):
        self.sickle.GetRecord(
            metadataPrefix='oai_dc', error='idDoesNotExist')

    # Previously also named test_idDoesNotExist, which replaced the test
    # above in the class namespace so it never ran.
    @raises(NoSetHierarchy)
    def test_noSetHierarchy(self):
        self.sickle.ListSets(
            metadataPrefix='oai_dc', error='noSetHierarchy')

    @raises(BadResumptionToken)
    def test_badResumptionToken(self):
        self.sickle.ListRecords(
            metadataPrefix='oai_dc', error='badResumptionToken')

    @raises(NoRecordsMatch)
    def test_noRecordsMatch(self):
        self.sickle.ListRecords(
            metadataPrefix='oai_dc', error='noRecordsMatch')

    @raises(OAIError)
    def test_undefined_OAI_error_XML(self):
        self.sickle.ListRecords(
            metadataPrefix='oai_dc', error='undefinedError')

    @mock.patch('sickle.app.Sickle.harvest', fake_harvest)
    def test_OAIResponseIterator(self):
        sickle = Sickle('fake_url', rtype='response')
        records = list(sickle.ListRecords(metadataPrefix='oai_dc'))
        assert len(records) == 4
Esempio n. 32
0
 def setUp(self):
     # Replace Sickle.harvest with fake_harvest, then build the client.
     harvest_patch = mock.patch('sickle.app.Sickle.harvest', fake_harvest)
     harvest_patch.start()
     self.sickle = Sickle('http://localhost')
Esempio n. 33
0
 def setUp(self):
     # Activate the patch stored on the instance before creating the client.
     self.patch.start()
     self.sickle = Sickle(endpoint='http://localhost')
Esempio n. 34
0
class TestCase(unittest.TestCase):
    """Tests for the Sickle client with ``Sickle.harvest`` mocked out.

    Every test runs with ``sickle.app.Sickle.harvest`` replaced by
    ``mock_harvest``. The expected counts asserted below (8 records,
    131 sets, ...) therefore describe whatever data ``mock_harvest`` serves.
    """

    def __init__(self, methodName='runTest'):
        super(TestCase, self).__init__(methodName)
        # The patcher is only created here; it is activated per test in setUp.
        self.patch = mock.patch('sickle.app.Sickle.harvest', mock_harvest)

    def setUp(self):
        """Activate the harvest patch and create the client under test."""
        self.patch.start()
        # Register the undo immediately: unittest skips tearDown when setUp
        # raises, but cleanups registered before the failure still run, so
        # the patch cannot leak into other tests.
        self.addCleanup(self.patch.stop)
        self.sickle = Sickle('http://localhost')

    def test_OAIResponse(self):
        """A harvested response exposes parsed XML and the raw text."""
        response = self.sickle.harvest(verb='ListRecords',
                                       metadataPrefix='oai_dc')
        self.assertIsInstance(response.xml, etree._Element)
        self.assertIsInstance(response.raw, string_types)

    def test_broken_XML(self):
        """Unparseable XML gives ``xml is None`` while ``raw`` stays text."""
        response = self.sickle.harvest(
            verb='ListRecords', resumptionToken='ListRecordsBroken.xml')
        self.assertIsNone(response.xml)
        self.assertIsInstance(response.raw, string_types)

    def test_ListRecords(self):
        """ListRecords yields all eight records."""
        records = self.sickle.ListRecords(metadataPrefix='oai_dc')
        self.assertEqual(len(list(records)), 8)

    def test_ListRecords_ignore_deleted(self):
        """With ``ignore_deleted=True`` only four records are yielded."""
        records = self.sickle.ListRecords(metadataPrefix='oai_dc',
                                          ignore_deleted=True)
        self.assertEqual(len(list(records)), 4)

    def test_ListSets(self):
        """ListSets yields 131 sets, each convertible to a dict."""
        sets = list(self.sickle.ListSets())
        self.assertEqual(131, len(sets))
        # Smoke check: the conversion itself must not raise.
        dict(sets[0])

    def test_ListMetadataFormats(self):
        """ListMetadataFormats yields five formats, each dict-convertible."""
        mdfs = list(self.sickle.ListMetadataFormats())
        self.assertEqual(5, len(mdfs))
        # Smoke check: the conversion itself must not raise.
        dict(mdfs[0])

    def test_ListIdentifiers(self):
        """ListIdentifiers yields four headers."""
        records = self.sickle.ListIdentifiers(metadataPrefix='oai_dc')
        self.assertEqual(len(list(records)), 4)

    def test_ListIdentifiers_ignore_deleted(self):
        """With ``ignore_deleted=True`` only two headers are yielded."""
        records = self.sickle.ListIdentifiers(
            metadataPrefix='oai_dc', ignore_deleted=True)
        # There are 2 deleted headers in the test data
        self.assertEqual(len(list(records)), 2)

    def test_Identify(self):
        """The Identify response exposes the expected attributes."""
        identify = self.sickle.Identify()
        expected_attributes = (
            'repositoryName',
            'baseURL',
            'adminEmail',
            'earliestDatestamp',
            'deletedRecord',
            'granularity',
            'description',
            'oai_identifier',
            'sampleIdentifier',
        )
        for attribute in expected_attributes:
            self.assertTrue(hasattr(identify, attribute))
        # Smoke check: the conversion itself must not raise.
        dict(identify)

    def test_GetRecord(self):
        """GetRecord returns the requested record with header and metadata."""
        oai_id = 'oai:test.example.com:1996652'
        record = self.sickle.GetRecord(identifier=oai_id)
        self.assertEqual(record.header.identifier, oai_id)
        self.assertIn(oai_id, record.raw)
        self.assertEqual(record.header.datestamp, '2011-09-05T12:51:52Z')
        self.assertIsInstance(record.xml, etree._Element)
        # Smoke checks: these conversions must not raise.
        binary_type(record)
        text_type(record)
        dict(record.header)
        self.assertEqual(dict(record), record.metadata)

    # Test OAI-specific exceptions

    def test_badArgument(self):
        """A 'badArgument' error raises BadArgument."""
        with self.assertRaises(BadArgument):
            self.sickle.ListRecords(metadataPrefix='oai_dc',
                                    error='badArgument')

    def test_cannotDisseminateFormat(self):
        """A 'cannotDisseminateFormat' error raises CannotDisseminateFormat."""
        with self.assertRaises(CannotDisseminateFormat):
            self.sickle.ListRecords(
                metadataPrefix='oai_dc', error='cannotDisseminateFormat')

    def test_idDoesNotExist(self):
        """An 'idDoesNotExist' error raises IdDoesNotExist."""
        with self.assertRaises(IdDoesNotExist):
            self.sickle.GetRecord(
                metadataPrefix='oai_dc', error='idDoesNotExist')

    def test_noSetHierarchy(self):
        """A 'noSetHierarchy' error raises NoSetHierarchy."""
        with self.assertRaises(NoSetHierarchy):
            self.sickle.ListSets(
                metadataPrefix='oai_dc', error='noSetHierarchy')

    def test_badResumptionToken(self):
        """A 'badResumptionToken' error raises BadResumptionToken."""
        with self.assertRaises(BadResumptionToken):
            self.sickle.ListRecords(
                metadataPrefix='oai_dc', error='badResumptionToken')

    def test_noRecordsMatch(self):
        """A 'noRecordsMatch' error raises NoRecordsMatch."""
        with self.assertRaises(NoRecordsMatch):
            self.sickle.ListRecords(
                metadataPrefix='oai_dc', error='noRecordsMatch')

    def test_undefined_OAI_error_XML(self):
        """An 'undefinedError' error raises OAIError."""
        with self.assertRaises(OAIError):
            self.sickle.ListRecords(
                metadataPrefix='oai_dc', error='undefinedError')

    def test_OAIResponseIterator(self):
        """With OAIResponseIterator, ListRecords yields four items."""
        sickle = Sickle('fake_url', iterator=OAIResponseIterator)
        records = list(sickle.ListRecords(metadataPrefix='oai_dc'))
        self.assertEqual(len(records), 4)
Esempio n. 35
0
 def setUp(self):
     """Patch ``Sickle.harvest`` with ``fake_harvest`` and create the client."""
     # NOTE(review): the patcher is started here but nothing visible in this
     # snippet stops it again; confirm the enclosing class cleans it up.
     harvest_patcher = mock.patch('sickle.app.Sickle.harvest', fake_harvest)
     harvest_patcher.start()
     self.sickle = Sickle('fake_url')