コード例 #1
0
    def generate(self):
        """Build one ResourceSync resource per OAI-PMH identifier header.

        Lists the identifiers of the configured repository (deleted records
        are ignored) for the configured set and metadata prefix, and converts
        every returned header with
        ``oaipmh_header_to_resourcesync_resource``.

        Returns:
            list: The converted resources, in the order the headers arrive.
        """
        client = Sickle(self.params['oaipmh_base_url'])
        identifier_headers = client.ListIdentifiers(
            ignore_deleted=True,
            set=self.params['oaipmh_set'],
            metadataPrefix=self.params['oaipmh_metadataprefix'],
        )
        convert = self.oaipmh_header_to_resourcesync_resource
        return [convert(header) for header in identifier_headers]
コード例 #2
0
 def test_no_retry(self):
     """A failing 503 response is requested exactly once when Sickle is
     constructed without any retry arguments."""
     # Every GET returns a 503 whose raise_for_status() raises HTTPError.
     mock_response = Mock(status_code=503,
                          headers={'retry-after': '10'},
                          raise_for_status=Mock(side_effect=HTTPError))
     mock_get = Mock(return_value=mock_response)
     with patch.object(Session, 'get', mock_get):
         sickle = Sickle('url')
         try:
             sickle.ListRecords()
         except HTTPError:
             # The error itself is expected; only the call count matters here.
             pass
         self.assertEqual(1, mock_get.call_count)
コード例 #3
0
def list_records():
    """Print the raw attributes of one week of ``scielo`` set records.

    Harvests ``oai_dc`` records of the ``scielo`` set dated from 2021-04-01
    until 2021-04-07 from ``URL_PREPRINTS_OAI`` and prints the result of
    ``doc_raw_attrs`` for each record.
    """
    # verify=False is forwarded to Sickle unchanged.
    harvester = Sickle(URL_PREPRINTS_OAI, verify=False)
    query = {
        'metadataPrefix': 'oai_dc',
        'from': '2021-04-01',
        'until': '2021-04-07',
        'set': 'scielo',
    }
    for record in harvester.ListRecords(**query):
        print(doc_raw_attrs(record))
コード例 #4
0
def get_xml_1(oai_identifier):
    """Download one record in ``xoai`` format and save it as an XML file.

    The record is fetched from https://dspace.cuni.cz/oai/nusl and written to
    ``../tests/data/<name>.xml`` relative to this file, where ``<name>`` is
    the part of ``oai_identifier`` after the last ``:`` with ``.`` replaced
    by ``_`` and ``/`` replaced by ``-``.  The path of the created file is
    printed.

    Args:
        oai_identifier: OAI identifier of the record to fetch.
    """
    sickle = Sickle("https://dspace.cuni.cz/oai/nusl")
    record = sickle.GetRecord(metadataPrefix="xoai", identifier=oai_identifier)

    target_directory = Path(__file__).parent / ".." / "tests" / "data"
    stem = oai_identifier.split(":")[-1].replace(".", "_").replace("/", "-")
    filename = str(target_directory / f"{stem}.xml")

    # Write as UTF-8 explicitly: the platform default encoding may be unable
    # to represent every character of the record's XML.
    with open(filename, "w", encoding="utf-8") as f:
        f.write(record.raw)
    print(filename, "created")
コード例 #5
0
ファイル: client.py プロジェクト: MITLibraries/hoard
 def __init__(self, source_url: str, format: str, set: str = None) -> None:
     """Store the harvest settings and create the OAI-PMH and HTTP clients.

     Args:
         source_url: Base URL handed to the Sickle client.
         format: Stored as ``self.format``.
         set: Stored as ``self.set``; defaults to ``None``.
     """
     self.source_url = source_url
     self.format = format
     self.set = set
     # Starts out empty.
     self.ids: Optional[Iterator] = None
     self.client = Sickle(self.source_url)
     self.session = requests.Session()
     # XML namespace prefixes for OAI-PMH and DSpace DIM.
     self.namespace = dict(
         oai="http://www.openarchives.org/OAI/2.0/",
         dim="http://www.dspace.org/xmlns/dspace/dim",
     )
コード例 #6
0
ファイル: test_sickle.py プロジェクト: tulibraries/sickle
 def test_pass_request_args(self):
     """Extra keyword arguments given to Sickle are passed on to the
     patched ``sickle.app.requests.get``."""
     mock_response = Mock(text='<xml/>')
     mock_get = Mock(return_value=mock_response)
     with patch('sickle.app.requests.get', mock_get):
         sickle = Sickle('url',
                         timeout=10,
                         proxies=dict(),
                         auth=('user', 'password'))
         sickle.ListRecords()
         # Exactly one GET is expected, carrying the verb plus the extra
         # request arguments supplied to the constructor.
         mock_get.assert_called_once_with('url',
                                          params={'verb': 'ListRecords'},
                                          timeout=10,
                                          proxies=dict(),
                                          auth=('user', 'password'))
コード例 #7
0
 def test_pass_request_args(self):
     """Extra keyword arguments given to Sickle are passed on to the
     patched ``Session.get``."""
     request_kwargs = {
         'timeout': 10,
         'proxies': dict(),
         'auth': ('user', 'password'),
     }
     fake_response = Mock(text=u'<xml/>', content='<xml/>', status_code=200)
     fake_get = Mock(return_value=fake_response)
     with patch.object(Session, 'get', fake_get):
         client = Sickle('url', **request_kwargs)
         client.ListRecords()
         # Exactly one GET is expected, carrying the verb plus the extra
         # request arguments supplied to the constructor.
         fake_get.assert_called_once_with(
             'url', params={'verb': 'ListRecords'}, **request_kwargs)
コード例 #8
0
def client_for_repository(repository):
    """
    Build a Sickle client for ``repository.url``.

    When ``repository.basic_auth_user`` is not the empty string, the user
    and ``repository.basic_auth_password`` are passed to Sickle as ``auth``.

    """
    username = repository.basic_auth_user
    # An empty user name means no extra constructor arguments are needed.
    auth_kwargs = (
        {'auth': (username, repository.basic_auth_password)}
        if username != '' else {}
    )
    return Sickle(repository.url, **auth_kwargs)
コード例 #9
0
    def get_oai_pmh_metadata(self, base_url: str) -> Dict[str, str]:
        """Return top-level metadata and set metadata of an OAI-PMH repository.

        Args:
            base_url: Base URL of the OAI-PMH repository.

        Returns:
            A dictionary that may contain ``repository_identifier`` and
            ``repository_name`` (when the Identify response has those
            attributes) and ``sets``, a mapping of setSpec to setName (when
            the repository supports sets).
            NOTE(review): the ``sets`` value is itself a dict, so the
            ``Dict[str, str]`` annotation is narrower than what is returned.

        Raises:
            IndexerError: If a request to the repository raises
                ``requests.RequestException``.
        """
        logging.debug(
            'Retrieving repository and set metadata from OAI-PMH repository %s',
            base_url)
        try:
            metadata = {}
            # One client serves both the Identify and the ListSets request.
            client = Sickle(base_url, timeout=60)

            # All repositories should have this metadata.
            repository_metadata = client.Identify()
            if hasattr(repository_metadata, 'repositoryIdentifier'):
                metadata[
                    'repository_identifier'] = repository_metadata.repositoryIdentifier
            if hasattr(repository_metadata, 'repositoryName'):
                metadata[
                    'repository_name'] = repository_metadata.repositoryName

            # Not all repositories will support sets.
            try:
                metadata['sets'] = {
                    s.setSpec: s.setName
                    for s in client.ListSets()
                }
            except sickle.oaiexceptions.NoSetHierarchy as e:
                logging.debug(
                    'Failed to list sets from OAI-PMH repository %s: %s',
                    base_url, e)

            return metadata

        except requests.RequestException as e:
            # Chain explicitly so the underlying HTTP failure is reported as
            # the direct cause of the IndexerError.
            raise IndexerError(
                'Failed to get repository metadata from OAI-PMH repository {}: {}'
                .format(base_url, e)) from e
コード例 #10
0
 def test_retry_on_custom_code(self):
     """A status code listed in ``retry_status_codes`` (here 500) is
     retried: with ``max_retries=3`` four GET calls are expected."""
     # Every GET returns a 500 whose raise_for_status() raises HTTPError.
     mock_response = Mock(status_code=500,
                          raise_for_status=Mock(side_effect=HTTPError))
     mock_get = Mock(return_value=mock_response)
     with patch.object(Session, 'get', mock_get):
         sickle = Sickle('url',
                         max_retries=3,
                         default_retry_after=0,
                         retry_status_codes=(503, 500))
         try:
             sickle.ListRecords()
         except HTTPError:
             # The error itself is expected; only the calls matter here.
             pass
         mock_get.assert_called_with('url', params={'verb': 'ListRecords'})
         self.assertEqual(4, mock_get.call_count)
コード例 #11
0
    def get_events(self, **kwargs):
        """Harvest ``oai_dc`` records from the portal's OAI-PMH URL and parse them.

        The harvest start date is, in order of preference: the date returned
        by ``get_most_recent_date`` for the tracked users, the configured
        ``DISALLOW_EVENTS_BEFORE`` value, or one day before now.

        Returns:
            ``False`` when there are no users, when the HTTP status of the
            response is not 200, or when the repository reports that no
            records match; otherwise ``None`` after the records have been
            handed to ``parse_records``.
        """
        LOG.debug(f"Executing {PORTAL_NAME} get events")
        if not self.users:
            LOG.debug("no users. exiting.")
            return False

        records_url = self.portal.get("event_urls", {}).get("oai_pmh_url")

        last_run = datetime.now()
        most_recent_datetime = self.get_most_recent_date(self.users)

        if most_recent_datetime:
            LOG.debug("start date value found in tracker state db entry.")
            from_datetime_str = most_recent_datetime.strftime(
                "%Y-%m-%dT%H:%M:%SZ")
            from_datetime = most_recent_datetime
            LOG.debug("earliest date allowed: {}".format(from_datetime_str))
        else:
            earliest_allowed = tracker_app.app.config.get(
                "DISALLOW_EVENTS_BEFORE")
            if earliest_allowed:
                from_datetime = datetime.strptime(earliest_allowed,
                                                  "%Y-%m-%dT%H:%M:%SZ")
            else:
                from_datetime = datetime.now() - timedelta(days=1)
            from_datetime_str = from_datetime.strftime("%Y-%m-%dT%H:%M:%SZ")

        LOG.debug("searching oai-pmh interface: %s" % records_url)
        try:
            sickle = Sickle(records_url)
            records = sickle.ListRecords(**{
                'metadataPrefix': 'oai_dc',
                'from': from_datetime_str
            })
            if records.oai_response.http_response.status_code != 200:
                LOG.debug("non-200 response code received. "
                          "updating tracker status and exiting.")
                self.complete_tracker(
                    records.oai_response.http_response.status_code)
                return False
        except oaiexceptions.NoRecordsMatch:
            LOG.debug("end of records in oai-pmh response")
            # ``records`` is never bound when ListRecords raises, so the
            # response object cannot be read here (doing so raised
            # UnboundLocalError).
            # NOTE(review): 200 assumes the OAI "no records match" error was
            # delivered in a successful HTTP response — confirm.
            self.complete_tracker(200)
            return False

        self.parse_records(records, from_datetime, last_run)
コード例 #12
0
ファイル: reader.py プロジェクト: alueschow/polymatheia
    def __iter__(self):
        """Return a new class:`~polymatheia.data.NavigableDictIterator` over the harvested records.

        Each record's raw XML is parsed with comments removed and converted by
        ``xml_to_navigable_dict``. When ``max_records`` is set, the iterator is
        wrapped in a class:`~polymatheia.data.LimitingIterator`.
        """
        def to_navigable_dict(record):
            # A fresh comment-stripping parser is created for every record.
            return xml_to_navigable_dict(
                etree.fromstring(
                    record.raw, parser=etree.XMLParser(remove_comments=True)))

        harvested = Sickle(self._url).ListRecords(
            metadataPrefix=self._metadata_prefix,
            set=self._set_spec,
            ignore_deleted=True)
        records = NavigableDictIterator(harvested, mapper=to_navigable_dict)
        if self._max_records is None:
            return records
        return LimitingIterator(records, self._max_records)
コード例 #13
0
ファイル: cli.py プロジェクト: unm-art/oai-pmh-harvester
def harvest(host, from_date, until, format, out, set, verbose):
    """Harvest records from an OAI-PMH host into a single output file.

    First lists all identifiers matching the date range, metadata format and
    (optional) set, then fetches every record individually and writes its raw
    XML, UTF-8 encoded, between ``<records>`` and ``</records>`` in ``out``.
    The process exits when the host reports that no records match.

    Args:
        host: Base URL of the OAI-PMH endpoint.
        from_date: Value sent as the ``from`` argument.
        until: Value sent as the ``until`` argument.
        format: Metadata prefix to request.
        out: Path of the output file (opened in binary write mode).
        set: Optional set to restrict the harvest to; ``None`` for no set.
        verbose: Log at DEBUG level when true, otherwise at INFO level.
    """
    logging.basicConfig(level=logging.DEBUG if verbose else logging.INFO)

    logging.info("OAI-PMH harvesting from %s", host)
    logging.info("From date = %s", from_date)
    logging.info("Until date = %s", until)
    logging.info("Metadata format = %s", format)
    logging.info("Outfile = %s", out)

    client = Sickle(host, iterator=OAIItemIterator)
    query = {'metadataPrefix': format, 'from': from_date, 'until': until}
    if set is not None:
        query['set'] = set
    try:
        headers = client.ListIdentifiers(**query)
    except NoRecordsMatch:
        logging.info("No records harvested: the combination of the values of "
                     "the arguments results in an empty list.")
        sys.exit()

    # Collect every identifier before any record is fetched.
    identifier_list = [header.identifier for header in headers]

    logging.info(f"Identifier count to harvest: {len(identifier_list)}")

    with open(out, 'wb') as outfile:
        outfile.write('<records>'.encode())

        for position, identifier in enumerate(identifier_list):
            fetched = client.GetRecord(identifier=identifier,
                                       metadataPrefix=format)
            outfile.write(fetched.raw.encode('utf8'))
            logging.debug(position)
            logging.debug(fetched.raw)

        outfile.write('</records>'.encode())

    # Every listed identifier has been fetched once this point is reached.
    logging.info("Total records harvested: %i", len(identifier_list))
コード例 #14
0
def main():

    # inputs
    sleep_ct = 900  # number of records until seconds
    sleep_time = 30  # secs
    base_url = 'http://export.arxiv.org/oai2'
    fname_prefix = "./raw_data/arXiv_oai_dc_"
    fname_log = "./raw_data/harvest.log"

    # create sickle
    sickle = Sickle(base_url)

    # get list of setSpecs
    ls_setSpec = get_ls_setSpec(sickle)
    ct_sets = len(ls_setSpec)

    # read log file to get last harvest date
    dt_last_harvest = get_dt_last_harvest(fname_log)

    # append records
    ct_records = 0
    for setSpec in ls_setSpec:
        print setSpec
        # get data file
        fname_data = fname_prefix + setSpec.replace(":", "_") + ".oai"
        f_data = open(fname_data, 'a')
        # append records
        records = sickle.ListRecords(**{
            "metadataPrefix": "oai_dc",
            "set": setSpec,
            "from": dt_last_harvest
        })
        for record in records:
            ct_records += 1
            f_data.write(str(record.metadata) + '\n')
            f_data.write(str(record.header) + '\n')
            if ct_records % sleep_ct == 0:
                print "sleep for %d secs" % (sleep_time)
                time.sleep(sleep_time)
        f_data.close()

    # log harvest
    logger = csv.writer(open(fname_log, 'a'))
    dt_prev = dt_last_harvest
    dt_curr = datetime.datetime.today().date() - relativedelta(days=1)
    logger.writerow([dt_curr, dt_prev, ct_sets, ct_records])
コード例 #15
0
def list_set_records(setSpec):
    """Return the ``oai_dc`` records of one set as a list of dictionaries.

    Each dictionary has the keys ``identifier``, ``datestamp``, ``setSpec``
    (the header's setSpecs) and ``dc`` (the record metadata).

    This is best effort: if harvesting fails, the failure is logged as a
    warning and the records gathered so far (possibly none) are returned.

    Args:
        setSpec: The set to list records from.
    """
    import logging

    set_recs = []
    sickle = Sickle(admin.get_repository_url())
    try:
        for rec in sickle.ListRecords(metadataPrefix='oai_dc', set=setSpec):
            set_recs.append({
                "identifier": rec.header.identifier,
                "datestamp": rec.header.datestamp,
                "setSpec": rec.header.setSpecs,
                "dc": rec.metadata,
            })
    except Exception as e:
        # Still return what was collected, but make the failure visible
        # instead of discarding it silently.
        logging.getLogger(__name__).warning(
            "Listing records for set %s stopped early: %s", setSpec, e)
    return set_recs
コード例 #16
0
    def run(self):
        """
        Update the pre-prints in Solr, or delete from the index.

        When ``self.args.delete`` is set, that value is passed to
        ``self.solr.delete``. Otherwise the ``oai_dc`` records of
        ``self.args.oai_url`` (restricted to ``self.from_date`` onwards when
        ``self.args.time`` is set) are converted with ``pipeline_to_xml`` and
        sent to Solr one by one; a record that fails is reported and skipped.
        The process exits with status 0 when no records match. Finally the
        index is committed and optimized.
        """
        options = self.args

        if options.delete:
            self.solr.delete(options.delete, commit=True)
        else:
            print("Indexing in {0}".format(self.solr.url))

            harvester = Sickle(options.oai_url, verify=False)

            query = {'metadataPrefix': 'oai_dc'}
            if options.time:
                query['from'] = self.from_date.strftime("%Y-%m-%dT%H:%M:%SZ")

            try:
                records = harvester.ListRecords(**query)
            except NoRecordsMatch as no_match:
                print(no_match)
                sys.exit(0)
            else:
                for position, record in enumerate(records):
                    # A failing record is reported and the loop moves on.
                    try:
                        xml = self.pipeline_to_xml(record.xml)
                        print("Indexing record %s with oai id: %s" %
                              (position, record.header.identifier))
                        self.solr.update(xml, commit=True)
                    except ValueError as err:
                        print("ValueError: {0}".format(err))
                        print(err)
                    except Exception as err:
                        print("Error: {0}".format(err))
                        print(err)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
コード例 #17
0
 def test_retry_on_503(self):
     """A 503 response is retried ``max_retries`` times, sleeping for the
     number of seconds given in its ``retry-after`` header each time."""
     failing_response = Mock(status_code=503,
                             headers={'retry-after': '10'},
                             raise_for_status=Mock(side_effect=HTTPError))
     fake_get = Mock(return_value=failing_response)
     fake_sleep = Mock()
     with patch('time.sleep', fake_sleep), \
             patch.object(Session, 'get', fake_get):
         client = Sickle('url', max_retries=3, default_retry_after=0)
         try:
             client.ListRecords()
         except HTTPError:
             # The error itself is expected; only the calls matter here.
             pass
         fake_get.assert_called_with('url', params={'verb': 'ListRecords'})
         # One initial request plus three retries, with a sleep before each
         # retry.
         self.assertEqual(4, fake_get.call_count)
         self.assertEqual(3, fake_sleep.call_count)
         # The header value (10) is used, not default_retry_after (0).
         fake_sleep.assert_called_with(10)
コード例 #18
0
ファイル: utils.py プロジェクト: llcit/llt
    def list_oai_community_sets(self, repository):
        """Collect the community sets of a repository into ``self.communities``.

        Utilizes the OAI-PMH verb ListSets. Every set whose setSpec starts
        with ``com`` is appended to ``self.communities`` as a tuple
        ``(setSpec, setName)``, i.e. (id, human readable name); the list is
        then sorted by name.

        Returns None. If the sets cannot be listed, ``self.communities`` is
        left unchanged.
        """
        try:
            sets = Sickle(repository.base_url).ListSets()
        except Exception:
            # Narrowed from a bare ``except:`` so that KeyboardInterrupt and
            # SystemExit are no longer swallowed.
            return

        # Filter the set list down to community sets.
        self.communities.extend(
            (s.setSpec, s.setName) for s in sets
            if s.setSpec.startswith('com'))

        self.communities = sorted(self.communities, key=lambda c: c[1])
コード例 #19
0
def main():
    """Write the ``oai_dc`` records of the selected collection to ``newFile``.

    Reads the module-level names ``collection`` ('t' or 'd'), ``date``,
    ``date_until`` and ``newFile``. The output is an XML declaration, an
    opening ``<OAI-PMH>`` element, the string form of every harvested record
    and the closing ``</OAI-PMH>`` tag.

    Raises:
        ValueError: If ``collection`` is neither 't' nor 'd'.
    """
    collection_ids = {'t': '129651', 'd': '129652'}
    if collection not in collection_ids:
        # Previously an unknown value left collectionID unbound and failed
        # later with UnboundLocalError.
        raise ValueError(
            "collection must be 't' or 'd', got {!r}".format(collection))
    collectionID = collection_ids[collection]

    sickle = Sickle('http://arizona.openrepository.com/arizona/oai/request?')

    recs = sickle.ListRecords(
        **{
            'metadataPrefix': 'oai_dc',
            'set': 'col_10150_' + collectionID,
            'from': date,
            'until': date_until
        })

    newFile.write('<?xml version="1.0" encoding="utf-8"?>')
    newFile.write(
        '<OAI-PMH xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:OAI-PMH="http://www.openarchives.org/OAI/2.0/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">'
    )
    for r in recs:
        newFile.write(str(r))
    newFile.write('</OAI-PMH>')
コード例 #20
0
    def parse_list(self, response):
        """Harvest the records of one set and yield the parsed result of each.

        This is a generator. Records whose identifier is already present in
        ``self._crawled_records`` are skipped; every other record is stored
        there, wrapped in an ``XmlResponse`` and passed to ``parse_record``.
        Yields nothing when the repository reports that no records match.

        Args:
            response: Response whose ``meta`` supplies the ``set`` and
                ``from_date`` values of the harvest.
        """
        sickle = Sickle(self.url)
        params = {
            'metadataPrefix': self.format,
            'set': response.meta['set'],
            'from': response.meta['from_date'],
            'until': self.until_date,
        }
        try:
            records = sickle.ListRecords(**params)
        except NoRecordsMatch as err:
            LOGGER.warning(err)
            # End the generator with ``return``: raising StopIteration inside
            # a generator is turned into RuntimeError (PEP 479).
            return

        # Avoid timing out the resumption token
        # TODO: implement a storage-based solution, to be able to handle large
        #       amounts of records.
        records = list(records)
        LOGGER.info(
            'Harvested %s record for params %s',
            len(records),
            params,
        )
        for record in records:
            rec_identifier = self.get_record_identifier(record)
            if rec_identifier in self._crawled_records:
                # avoid cross-set repeated records
                LOGGER.info('Skipping duplicated record %s', rec_identifier)
                continue

            LOGGER.debug(
                'Not skipping non-duplicated record %s',
                rec_identifier,
            )

            self._crawled_records[rec_identifier] = record
            response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
            selector = Selector(response, type='xml')

            try:
                yield self.parse_record(selector)
            except Exception as err:
                LOGGER.error(err)
コード例 #21
0
ファイル: utils.py プロジェクト: llcit/slrp
def get_bitstream_url(collection, record_in):
    """ Harvests the bitstream urls for the record in the repository.
    E.g., https://scholarspace.manoa.hawaii.edu/bitstream/10125/25006/1/editor.pdf

    Fetches the record in ``ore`` format (with ``LltRecordBitstream`` mapped
    to GetRecord) and returns a dict with the keys ``bitstream`` and
    ``bitstream_txt``. Each value is the first entry of the matching metadata
    field with ``+`` replaced by ``%20``, or None when it cannot be read.
    """
    client = Sickle(collection.community.repository.base_url)
    client.class_mapping['GetRecord'] = LltRecordBitstream
    record = client.GetRecord(metadataPrefix='ore', identifier=record_in.header.identifier)
    bitstreams = {'bitstream': None, 'bitstream_txt': None}

    # Each field is tried on its own so one failure does not affect the other.
    failure_messages = (
        ('bitstream', 'Unable to construct bitstream url for'),
        ('bitstream_txt', 'Unable to construct bitstream_txt url for'),
    )
    for field, message in failure_messages:
        try:
            bitstreams[field] = record.metadata[field][0].replace('+', '%20')
        except Exception as e:
            print (e, message, record_in.header.identifier)

    return bitstreams
コード例 #22
0
def list_sets(repository_url=None):
    """Return at most 500 sets of a repository as ``setSpec``/``setName`` dicts.

    Falls back to ``admin.get_repository_url()`` when no URL is given. Any
    error other than the end of iteration while reading the sets is passed to
    ``abort`` with status 400.
    """
    if not repository_url:
        repository_url = admin.get_repository_url()
    set_iterator = Sickle(repository_url).ListSets()

    collected = []
    try:
        # Hard cap of 500 sets per call.
        for _ in range(500):
            entry = set_iterator.next()
            collected.append(dict(setSpec=entry.setSpec,
                                  setName=entry.setName))
    except StopIteration:
        # Fewer than 500 sets available.
        pass
    except Exception as e:
        abort(400, e)

    return collected
コード例 #23
0
 def get_direct_records(context, params):
     """Harvest the records of every repository configured for a context.

     For each entry under ``OAIBridge.data["contexts"][context]`` the records
     are listed with ``params``; when the entry defines ``sets``, one request
     per set is made with ``set`` added to a copy of ``params``. A
     "no records match" answer is skipped silently. Any other error prints
     the traceback and stops the whole harvest, returning what was collected
     so far.

     Args:
         context: Key of the context to harvest.
         params: Keyword arguments for ``ListRecords``; not modified.

     Returns:
         A tuple ``(i, records)``: the number of records seen (deleted ones
         included) and the metadata of the non-deleted records.
     """
     records = []
     i = 0
     root = OAIBridge.data["contexts"][context]
     for name in root:
         source = root[name]
         sickle = Sickle(source['url'])
         sets = source['sets'] if 'sets' in source else None
         # One query per configured set, or a single query without a set.
         if sets:
             queries = [dict(params, set=set_name) for set_name in sets]
         else:
             queries = [params]
         for query in queries:
             try:
                 for record in sickle.ListRecords(**query):
                     i += 1
                     if not record.deleted:
                         records.append(record.metadata)
             except NoRecordsMatch:
                 pass
             except Exception:
                 # Narrowed from a bare ``except:`` so KeyboardInterrupt and
                 # SystemExit propagate; an unknown error ends the harvest.
                 print(traceback.format_exc())
                 return i, records
     return i, records
コード例 #24
0
    def run(self):
        """
        Update the pre-prints in Solr, or delete from the index.

        When ``self.args.delete`` is set, that value is passed to
        ``self.solr.delete``. Otherwise the ``oai_dc`` records of
        ``self.args.oai_url`` from ``self.from_date`` onwards are converted
        with ``pipeline_to_xml`` and sent to Solr one by one; a record that
        fails is logged and skipped. Finally the index is committed and
        optimized.
        """
        if self.args.delete:
            self.solr.delete(self.args.delete, commit=True)
        else:
            logger.info("Indexing in {0}".format(self.solr.url))

            harvester = Sickle(self.args.oai_url)
            query = {
                'metadataPrefix': 'oai_dc',
                'from': self.from_date.strftime("%Y-%m-%dT%H:%M:%SZ"),
            }

            for record in harvester.ListRecords(**query):
                # A failing record is logged and the loop moves on.
                try:
                    xml = self.pipeline_to_xml(record.xml)
                    self.solr.update(xml, commit=True)
                except ValueError as err:
                    logger.error("ValueError: {0}".format(err))
                    logger.exception(err)
                except Exception as err:
                    logger.error("Error: {0}".format(err))
                    logger.exception(err)

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
コード例 #25
0
ファイル: updater.py プロジェクト: TejasAvinashShetty/arXiver
    def run(self, update_all=False, override=False):
        arxiv = Sickle('http://export.arxiv.org/oai2')

        # date = datetime.date(2014, 5, 14)
        # records = arxiv.ListRecords(**{'metadataPrefix': 'arXiv', 'from': str(date)})
        # print str(datetime.date(2014, 5, 14))
        last_update = Synchronization.query.order_by(
            Synchronization.id.desc()).first()
        if (datetime.datetime.utcnow() - last_update.date).days < 1:
            return 0

        if last_update is None or update_all:
            date = None
            records = arxiv.ListRecords(metadataPrefix='arXiv')
        else:
            date = last_update.date.date()
            records = arxiv.ListRecords(**{
                'metadataPrefix': 'arXiv',
                'from': str(date)
            })

        count = 0
        badrecords = []
        for r in records:
            count += 1
            if count % 1000 == 0: print count
            try:
                a = self.add_article(r.metadata)
            except Exception as e:
                badrecords.append(r)
                print "Exception: ", e
            # print a.title
            db.session.commit()
        db.session.add(Synchronization(date=datetime.datetime.now()))
        db.session.commit()
        print "all done!"
        return count
コード例 #26
0
def oai_get_record(id,
                   name,
                   transformation,
                   record_cls,
                   access_token=None,
                   identifier=None,
                   dbcommit=False,
                   reindex=False,
                   test_md5=False,
                   verbose=False,
                   debug=False,
                   **kwargs):
    """Fetch a single record from a configured OAI repository and transform it.

    :param id: record id, appended to ``identifier`` to form the OAI
        identifier.
    :param name: name passed to ``get_info_by_oai_name`` to look up the
        repository URL and metadata prefix.
    :param transformation: callable applied to the first parsed record; its
        ``json`` attribute is returned.
    :param access_token: sent as ``accessToken`` when truthy.
    :param identifier: prefix of the OAI identifier.
    :param verbose: echo a line naming the fetched record.
    :param debug: re-raise a failed request as ``Exception`` instead of
        returning None.
    :returns: the transformed record, or None when the request fails and
        ``debug`` is false.
    """
    url, metadata_prefix, _lastrun, _setspecs = get_info_by_oai_name(name)
    client = Sickle(url)

    query = {'accessToken': access_token} if access_token else {}
    query.update(metadataPrefix=metadata_prefix,
                 identifier=f'{identifier}{id}')

    try:
        record = client.GetRecord(**query)
    except Exception as err:
        if not debug:
            return None
        raise Exception(err)

    parsed = parse_xml_to_array(StringIO(record.raw))
    trans_record = transformation(parsed[0]).json
    if verbose:
        click.echo(f'OAI-{name} get: {id}')
    return trans_record
コード例 #27
0
    def __init__(self,
                 endpoint='http://export.arxiv.org/oai2',
                 metadataPrefix='oai_dc',
                 harvest_set='cs',
                 recsFrom=None,
                 recsUntil=''):
        """Store the harvest settings and create the Sickle client.

        Args:
            endpoint: Base URL of the OAI-PMH endpoint.
            metadataPrefix: Metadata format to harvest.
            harvest_set: Set to harvest.
            recsFrom: Start of the harvest window. ``None`` (the default)
                means yesterday's date as a string, computed when the object
                is created.
            recsUntil: End of the harvest window. An empty value means
                today's date.
        """
        self.endpoint = endpoint
        self.metadataPrefix = metadataPrefix
        self.harvest_set = harvest_set

        # The default is resolved per instance: a default expression in the
        # signature is evaluated only once, when the function is defined, so
        # "yesterday" would go stale in a long-running process.
        if recsFrom is None:
            recsFrom = str(date.today() - relativedelta(days=1))
        self.recsFrom = recsFrom

        # NOTE(review): the fallback here is a date object while recsFrom
        # falls back to a string — confirm this difference is intended.
        if not recsUntil:
            self.recsUntil = date.today()
        else:
            self.recsUntil = recsUntil

        self.responses = None
        self.recs = []

        self.idfiers = []
        self.descriptions = []

        self.sickle = Sickle(endpoint, iterator=OAIResponseIterator)
コード例 #28
0
    print("[" + now + "]\t" + text)
    # forces to output the result of the print command immediately, see: http://stackoverflow.com/questions/230751/how-to-flush-output-of-python-print
    sys.stdout.flush()


runningFromWithinStabi = False
# main PPN harvesting
savedRecords = []

if runningFromWithinStabi:
    proxy = urllib.request.ProxyHandler({})
    opener = urllib.request.build_opener(proxy)
    urllib.request.install_opener(opener)

# create OAI-PMH reader pointing to the Stabi OAI-PMH endpoint of the digitized collections
sickle = Sickle('http://digital.staatsbibliothek-berlin.de/oai')
records = sickle.ListRecords(metadataPrefix='oai_dc', set='DC_all')

if True:
    printLog("Starting OAI-PMH record download...")
    # initialize some variables for counting and saving the metadata records
    savedDocs = 0

    maxDocs = 146000  # 100 is just for testing, for more interesting results increase this value to 1000. ATTENTION! this will also take more time for reading data.

    # save the records locally as we don't want to have to rely on a connection to the OAI-PMH server all the time
    # iterate over all records until maxDocs is reached
    # ATTENTION! if you re-run this cell, the contents of the savedRecords array will be altered!
    for record in records:
        # check if we reach the maximum document value
        if savedDocs < maxDocs:
コード例 #29
0
#!/usr/bin/python3
from sickle import Sickle
from pymarc import Record, Field, MARCWriter
import os
import re

# configurations
save_file = 'c:\\users\\user\\desktop\\books.dat'

# delete old file (if exists)
# os.remove needs no shell and does not depend on the Windows-only `del`
# command; a missing file is simply ignored.
try:
    os.remove(save_file)
except FileNotFoundError:
    pass

# load OAI-PMH client
# documentation: https://sickle.readthedocs.io/en/latest/
sickle = Sickle('http://content.cdlib.org/oai')
records = sickle.ListRecords(metadataPrefix='oai_dc', set='YOUR_SET_ID_HERE')

# parse harvested records and generate MARC21
for record in records:
    # parse dc record
    dc  = record.metadata
    contributors = dc.get("contributor", [])
    coverages = dc.get("coverage", [])
    creators = dc.get("creator", [])
    dates = dc.get("date", [])
    descriptions = dc.get("description", [])
    formats = dc.get("format", [])
    identifiers = dc.get("identifier", [])
    languages = dc.get("language", [])
    publishers = dc.get("publisher", [])
    relations = dc.get("relation", [])
コード例 #30
0
ファイル: test_sickle.py プロジェクト: lnielsen/sickle
 def test_invalid_iterator(self):
     """Construct a Sickle client with ``iterator=None``."""
     # NOTE(review): there is no assertion here; presumably the expected
     # error is checked by a decorator outside this view — confirm.
     Sickle("http://localhost", iterator=None)