Code example #1
class RemoteOAIRELoader(BaseOAIRELoader):
    """Remote OpenAIRE dataset loader.

    Fetch OpenAIRE records from a remote OAI-PMH endpoint.
    """
    def __init__(self, source=None, setspec=None, **kwargs):
        """Init the loader for remote OAI-PMH access."""
        super(RemoteOAIRELoader, self).__init__(
            source or current_app.config['OPENAIRE_OAIPMH_ENDPOINT'], **kwargs)
        self.client = Sickle(self.source)
        self.setspec = setspec or \
            current_app.config['OPENAIRE_OAIPMH_DEFAULT_SET']

    def iter_grants(self, as_json=True):
        """Fetch grants from a remote OAI-PMH endpoint.

        Return the Sickle-provided generator object.
        """
        records = self.client.ListRecords(metadataPrefix='oaf',
                                          set=self.setspec)
        for rec in records:
            try:
                grant_out = rec.raw  # rec.raw is XML
                if as_json:
                    grant_out = self.grantxml2json(grant_out)
                yield grant_out
            except FunderNotFoundError as e:
                current_app.logger.warning("Funder '{0}' not found.".format(
                    e.funder_id))
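
A sketch of how this loader might be used inside a Flask application context (the set name and the consumer function are placeholders, not part of the original project):

    loader = RemoteOAIRELoader(setspec='projects')
    for grant in loader.iter_grants(as_json=True):
        process_grant(grant)  # hypothetical consumer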
Code example #2
def download_oai_dc(outfile=None,
                    base_url='http://bibliotheque-numerique.inha.fr/oai',
                    sets=16800,
                    force_refresh=False):
    """Downloads a set from a oai-pmh repository and returns it
    if given an outfile, save the resuls to it,
    will also use it as a cache if needed
    """
    if outfile and os.path.exists(outfile) and not force_refresh:
        return read_json_gzip(outfile)

    sickle = Sickle(base_url)
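    # harvest Dublin Core records from the requested set; this repository
    # encodes set identifiers as "oai:sets:<number>"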
    records = sickle.ListRecords(**{
        'metadataPrefix': 'oai_dc',
        'set': "oai:sets:%d" % sets
    })

    records_fetched = []
    for i, record in enumerate(tqdm(records)):
        if i == 100:  # stop after the first 100 records
            break
        records_fetched.append(record.metadata)
    if outfile:
        write_json_gzip(outfile, records_fetched)

    return records_fetched
Code example #3
 def descriptions(self):
     """Retrive descriptions and dumps it in cache file"""
     s = Sickle(self.__url_api__)
     records = [record for record in s.ListRecords(metadataPrefix='oai_dc')]
     descr = OrderedDict()
     subjects = OrderedDict()
     cats = []
     for record in records:
         item = record.metadata['source'][0].split(',')[1].strip()
         descr[item] = record.metadata
         self.__log__.info('%s', item)
         if 'subject' in record.metadata:
             item_subjects = record.metadata['subject']
             for t in item_subjects:
                 if t in subjects:
                     subjects[t] = subjects[t] + 1
                 else:
                     subjects[t] = 1
                     cats.append(self.subject_to_category(t))
         else:
             self.__log__.warning('  no subject for %s', item)
     self.__log__.info('Parsed %s items', len(records))
     self.__log__.info('Subjects: %s', json.dumps(subjects, indent=2))
     for cat in cats:
         self.__log__.info('  [[%s]]', cat)
         page = self.__site__.pages[cat]
         if not page.exists:
             page.save('[[{}]]'.format(self.__category__), 'Upload cat')
     self.__log__.info('Dumping metadata %s', self.__cache_meta__)
     with open(self.__cache_meta__, 'w') as f:
         json.dump(descr, f, indent=4, ensure_ascii=False)
Code example #4
def main():

    sickle = Sickle('http://arizona.openrepository.com/arizona/oai/request?')
    recs = sickle.ListRecords(**{'metadataPrefix': 'oai_dc',
                                 'set': 'com_10150_129649',
                                 'from': '2017-04-05'})

    # 'newFile' is never defined in the original excerpt; an output file is
    # opened here (the name is an assumption) so the snippet runs as written
    newFile = open('arizona_oai.xml', 'w', encoding='utf-8')
    newFile.write('<?xml version="1.0" encoding="utf-8"?>')
    newFile.write('<OAI-PMH xmlns:dc="http://purl.org/dc/elements/1.1/" xmlns:OAI-PMH="http://www.openarchives.org/OAI/2.0/" xmlns:oai_dc="http://www.openarchives.org/OAI/2.0/oai_dc/" xmlns:xsi="http://www.w3.org/2001/XMLSchema-instance" xsi:schemaLocation="http://www.openarchives.org/OAI/2.0/ http://www.openarchives.org/OAI/2.0/OAI-PMH.xsd">')
    for r in recs:
        newFile.write(str(r))
    newFile.write('</OAI-PMH>')
    newFile.close()
Code example #5
 def run(self):
     timestamp = datetime.utcnow()
     sickle = Sickle('http://invenio.nusl.cz/oai2d/')
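     # swap in a custom record class so each harvested item is parsed by
     # the project's MarcXMLParser instead of Sickle's default Record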
     sickle.class_mapping['ListRecords'] = MarcXMLParser
     sickle.class_mapping['GetRecord'] = MarcXMLParser
     oai_logger.info("Loading records")
     records = sickle.ListRecords(metadataPrefix='marcxml')
     for idx, record in enumerate(records):
         print(f"{idx}. {record.id}")
         oai_logger.info(f"{idx}. {record.id}")
         try:
             current_search_client.index(
                 index=self.index,
                 id=record.marc_dict["001"],
                 body=record.marc_dict
             )
         except Exception:
             exc_traceback = traceback.format_exc()
             print(exc_traceback)
             print("\n\n\n")
             file_name = f'{timestamp.strftime("%Y%m%dT%H%M%S")}.err'
             file_path = os.path.join(self.path, file_name)
             with open(file_path, "a") as f:
                 f.write(
                     f"Dictionary: {record.marc_dict}\n\n"
                     f"{exc_traceback}\n\n\n\n")
             continue
Code example #6
    def fetch_oai_recs_day(self, date):
        """
        Generator that returns the key and full record
        of works deposited on a particular day.

        :param date: the day whose deposited works should be harvested
        :return: str key, str rec
        """
        api = Sickle(self.endpoint_url)
        date_str = date.isoformat()
        # 'from' is a reserved Python keyword, so the arguments are passed
        # via dict expansion, as recommended by the Sickle docs
        try:
            records = api.ListRecords(
                **{
                    'metadataPrefix': self.metadata_prefix,
                    'from': date_str,
                    'until': date_str,
                })
        except oaiexceptions.NoRecordsMatch:
            records = []
            _LOGGER.info('OAI request produced no records.')

        for item in records:
            yield item.header.identifier.encode('utf-8'), item.raw.encode(
                'utf-8')
Code example #7
    def fetch_iter(self):

        try:
            sickle = Sickle(self._oaiconfig['OAI']['url'])
            records_iter = sickle.ListRecords(**self.dic)
            for record in records_iter:
                yield record

        except BadArgument as ba:
            self._exception_logger.error(
                "bad argument exception {EXCEPTION}".format(EXCEPTION=str(ba)))
        except OAIError as oaiError:
            self._exception_logger.error(
                "OAIError exception {EXCEPTION}".format(
                    EXCEPTION=str(oaiError)))
        except NoRecordsMatch as noRecordsmatch:
            self._summary_logger.error("no records matched {EXCEPTION}".format(
                EXCEPTION=str(noRecordsmatch)))
        except Exception as baseException:
            self._summary_logger.error(
                "base exception occurred - not directly related to OAI {EXCEPTION}"
                .format(EXCEPTION=str(baseException)))
        else:
            print("oai fetching finished successfully")
Code example #8
def main():
    parser = argparse.ArgumentParser()
    parser.add_argument('-f', '--from_date', required=True)
    params = parser.parse_args()

    logging.basicConfig(level=logging.INFO,
                        format='[%(asctime)s] %(levelname)s %(message)s',
                        datefmt='%d/%b/%Y %H:%M:%S')

    oai_client = Sickle(endpoint=OAI_PMH_PREPRINT_ENDPOINT,
                        max_retries=3,
                        verify=False)
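    # note: verify=False above is passed through to requests and disables TLS
    # certificate verification; max_retries retries failed requests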
    records = oai_client.ListRecords(**{
        'metadataPrefix': OAI_METADATA_PREFIX,
        'from': params.from_date
    })

    logging.info('Fetching OAI-PMH Preprints data for date >= %s',
                 params.from_date)
    data = {}
    for r in records:
        data.update(parse(r))

    filename = ''.join([PREPRINT_DICTIONARY_PREFIX, params.from_date, '.json'])
    save(data, filename)
Code example #9
    def _get_database(self, number):
        """
        This object method makes an api call to neliti api and iteratively 
	    yields each record entry for processing, keeping track of the total no 
	    of records that has been successfully processed.
        """

        sickle = Sickle(self.url)
        records = sickle.ListRecords(metadataPrefix='oai_dc',
                                     ignore_deleted=True)
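        # ignore_deleted=True makes Sickle skip records whose OAI header
        # marks them as deleted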
        no_of_records = 0
        for record in records:
            data = self._get_record_data(record)
            if not data:
                continue
            self.database.append(data)
            no_of_records += 1
            if no_of_records % 100 == 0:
                print("Progress : {no_of_records} records Downloaded".format(
                    no_of_records=no_of_records))
            if no_of_records == number:
                break
Code example #10
def ssoarharvest(filename='support_data/data_harvest1.json'):
    sickle = Sickle('https://www.ssoar.info/OAIHandler/request')
    records = sickle.ListRecords(metadataPrefix='oai_genios')
    counter = 0
    listofcounter = []
    for r in records:
        counter += 1
        listofcounter.append(r)
        if counter % 10000 == 0:
            print(counter)

    llt = []
    errorls = []
    for index, item in enumerate(listofcounter):
        try:
            # the JSON round-trip converts the parsed OrderedDicts into
            # plain dicts; the original used eval() on the JSON string,
            # which json.loads does safely
            llt.append(json.loads(
                json.dumps(xmltodict.parse(etree.tostring(item.xml)))))
        except Exception:
            errorls.append(index)

    a = {}
    a["result"] = llt

    with open(filename, 'w') as fp:
        json.dump(a, fp, indent=4)
Code example #11
File: utils.py Project: llcit/llt
 def harvest_oai_collection_records_sickle(self, collection):
     sickle = Sickle(collection.community.repository.base_url)
     sickle.class_mapping['ListRecords'] = LltRecord
     sickle.class_mapping['GetRecord'] = LltRecord
     records = sickle.ListRecords(metadataPrefix='dim',
                                  ignore_deleted=True,
                                  set=collection.identifier)
     return records
Code example #12
File: harvest.py Project: tulibraries/tulflow
def harvest_oai(**kwargs):
    """Create OAI ListRecords Iterator for Harvesting Data."""
    oai_endpoint = kwargs.get("oai_endpoint")
    harvest_params = kwargs.get("harvest_params")
    logging.info("Harvesting from %s", oai_endpoint)
    logging.info("Harvesting %s", harvest_params)
    request = Sickle(oai_endpoint, retry_status_codes=[500, 503])
    data = request.ListRecords(**harvest_params)
    return data
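
A minimal call of this helper might look like the following (the endpoint URL and prefix are placeholders, not taken from the project):

    records = harvest_oai(oai_endpoint="https://example.org/oai",
                          harvest_params={"metadataPrefix": "oai_dc"})
    for record in records:  # Sickle pages through resumption tokens lazily
        print(record.header.identifier)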
Code example #13
def _fetch_records(endpoint, count):
    subset = []
    sickle = Sickle(endpoint)
    records = sickle.ListRecords(metadataPrefix='oai_dc', ignore_deleted=True)
    for idx, rec in enumerate(records):
        if idx == count:
            break
        subset.append(rec.metadata)
    return subset
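
For example, fetching the first 50 Dublin Core records from a hypothetical endpoint:

    sample = _fetch_records('https://example.org/oai', 50)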
Code example #14
File: test_sickle.py Project: titabo2k/sickle
 def test_pass_request_args(self):
     mock_response = Mock(text=u'<xml/>', content='<xml/>')
     mock_get = Mock(return_value=mock_response)
     with patch('sickle.app.requests.get', mock_get):
         sickle = Sickle('url', timeout=10, proxies=dict(),
                         auth=('user', 'password'))
         sickle.ListRecords()
         mock_get.assert_called_once_with('url',
                                          params={'verb': 'ListRecords'},
                                          timeout=10, proxies=dict(),
                                          auth=('user', 'password'))
Code example #15
def crawl_metadata(output_file, oai_params=None, fetch_limit=None):
    """
    crawls records, flushes them regularily to a temporary json file.
    low memory footprint, no loss of intermediate results.
    """
    sickle = Sickle(URL_OAI2)
    oai_params = oai_params if oai_params else {}

    logger.info("{} - retrieving records from {} with params {}".format(str(datetime.now()), URL_OAI2, str(oai_params)))
    t0 = time.time()
    t_last = t0
    raw_records = sickle.ListRecords(**oai_params)  # type: OAIItemIterator
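    # note: _get_resumption_token() and oai_response are Sickle internals,
    # used below to estimate the complete list size and per-batch record count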

    metadata_list = []
    records_size = int(raw_records._get_resumption_token().complete_list_size)
    batch_counter = 0
    batch_size = raw_records.oai_response.http_response.content.decode().count("</record>")
    batch_sum = int(math.ceil(records_size / float(batch_size)))
    counter = 0
    for raw_record in raw_records:  # type: Record
        # parse element and append
        try:
            identifier, record = parse_raw_record(raw_record)
            if record:
                metadata_list.append(record)
            else:
                logger.debug("Record `{}` was deleted and will therefore not appear in the results.".format(identifier))
        except Exception:
            logger.warning("Failed to parse record %s", str(raw_record), exc_info=1)

        # write batch to file and write log
        counter += 1
        if counter % batch_size == 0:
            # write batch to file
            batch_counter += 1
            util.json_write_lines(metadata_list, output_file, append=(batch_counter > 1))
            metadata_list = []

            # log event
            t_current = time.time()
            t_remaining = ((1 / (counter / records_size)) - 1) * (t_current - t0)
            logger.info("Batch {}/{}: fetched {} of {} records (took {}s, remaining: {} min, resumption token: {})".format(
                batch_counter, batch_sum, counter, records_size, round(t_current - t_last, 2), round(t_remaining / 60, 1),
                raw_records._get_resumption_token().token))
            t_last = t_current
        if fetch_limit and counter >= fetch_limit:
            break

    # write last batch
    if len(metadata_list) > 0:
        logger.info("Batch {}/{}: fetched the remaining {} records".format(batch_sum, batch_sum, len(metadata_list)))
        util.json_write_lines(metadata_list, output_file, append=True)

    logger.info("All {} entries were retrieved in {}s and written to {}".format(counter, round(time.time() - t0), output_file))
Code example #16
def download_records(input_file: str):
    with open(input_file, "r") as f:
        reader = csv.reader(f, delimiter=",")
        lines = [line[0] for line in reader]

    out_dir = lines[0]
    names = lines[1:]

    if os.path.exists(out_dir):
        shutil.rmtree(out_dir)
    os.mkdir(out_dir)

    sickle = Sickle(URL)
    recs = sickle.ListRecords(metadataPrefix="oai_dc",
                              set=DATASET,
                              ignore_deleted=True)

    infos = []
    for rec in recs:
        metadata = rec.metadata

        desc = metadata["description"][0]
        find_base_name = re.search("base_name : (.*)\n", desc)
        if find_base_name is not None:
            name = find_base_name.group(1)
            if name in names:
                identifier = metadata["identifier"][0]
                infos.append((name, identifier))

    for name, identifier in infos:

        record_path = os.path.join(out_dir, name)
        if not os.path.exists(record_path):
            os.mkdir(record_path)

        filename = name + ".hdf5"
        features_filename = name + "_features.hdf5"
        wcon_filename = name + ".wcon.zip"

        if not os.path.exists(os.path.join(record_path, filename)):
            url = identifier + "/files/" + urllib.parse.quote(filename)
            urllib.request.urlretrieve(url,
                                       os.path.join(record_path, filename))

            url = identifier + "/files/" + urllib.parse.quote(
                features_filename)
            urllib.request.urlretrieve(
                url, os.path.join(record_path, features_filename))

            url = identifier + "/files/" + urllib.parse.quote(wcon_filename)
            urllib.request.urlretrieve(
                url, os.path.join(record_path, wcon_filename))

            logger.info(f"Downloaded {name}")
Code example #17
 def test_no_retry(self):
     mock_response = Mock(status_code=503,
                          headers={'retry-after': '10'},
                          raise_for_status=Mock(side_effect=HTTPError))
     mock_get = Mock(return_value=mock_response)
     with patch.object(Session, 'get', mock_get):
         sickle = Sickle('url')
         try:
             sickle.ListRecords()
         except HTTPError:
             pass
         self.assertEqual(1, mock_get.call_count)
Code example #18
def list_records():
    sik = Sickle(URL_PREPRINTS_OAI, verify=False)
    records = sik.ListRecords(
        **{
            'metadataPrefix': 'oai_dc',
            'from': '2021-04-01',
            'until': '2021-04-07',
            'set': 'scielo'
        })
    for r in records:
        doc = doc_raw_attrs(r)
        print(doc)
Code example #19
File: updater.py Project: TejasAvinashShetty/arXiver
    def run(self, update_all=False, override=False):
        arxiv = Sickle('http://export.arxiv.org/oai2')

        last_update = Synchronization.query.order_by(
            Synchronization.id.desc()).first()
        # check for a missing sync row before touching last_update.date
        if last_update is not None and \
                (datetime.datetime.utcnow() - last_update.date).days < 1:
            return 0

        if last_update is None or update_all:
            date = None
            records = arxiv.ListRecords(metadataPrefix='arXiv')
        else:
            date = last_update.date.date()
            records = arxiv.ListRecords(**{
                'metadataPrefix': 'arXiv',
                'from': str(date)
            })

        count = 0
        badrecords = []
        for r in records:
            count += 1
            if count % 1000 == 0: print(count)
            try:
                a = self.add_article(r.metadata)
            except Exception as e:
                badrecords.append(r)
                print "Exception: ", e
            # print a.title
            db.session.commit()
        db.session.add(Synchronization(date=datetime.datetime.now()))
        db.session.commit()
        print "all done!"
        return count
Code example #20
 def get_direct_records(context, params):
     records = []
     i = 0
     root = OAIBridge.data["contexts"][context]
     for name in root:
         sickle = Sickle(root[name]['url'])
         sets = root[name]['sets'] if 'sets' in root[name] else None
         if not sets:
             try:
                 for record in sickle.ListRecords(**params):
                     i += 1
                     if not record.deleted:
                         records.append(record.metadata)
             except NoRecordsMatch:
                 pass
             except Exception:
                 print(traceback.format_exc())
                 break
         else:
             unknown_error = False
             for set_name in sets:
                 new_params = dict(params)
                 new_params['set'] = set_name
                 try:
                     for record in sickle.ListRecords(**new_params):
                         i += 1
                         if not record.deleted:
                             records.append(record.metadata)
                 except NoRecordsMatch:
                     pass
                  except Exception:
                      print(traceback.format_exc())
                      unknown_error = True
                      break
             if unknown_error:
                 break
     return i, records
Code example #21
 def test_retry_on_custom_code(self):
     mock_response = Mock(status_code=500,
                          raise_for_status=Mock(side_effect=HTTPError))
     mock_get = Mock(return_value=mock_response)
     with patch.object(Session, 'get', mock_get):
         sickle = Sickle('url',
                         max_retries=3,
                         default_retry_after=0,
                         retry_status_codes=(503, 500))
         try:
             sickle.ListRecords()
         except HTTPError:
             pass
         mock_get.assert_called_with('url', params={'verb': 'ListRecords'})
         self.assertEqual(4, mock_get.call_count)
Code example #22
    def get_events(self, **kwargs):
        LOG.debug(f"Executing {PORTAL_NAME} get events")
        if not self.users:
            LOG.debug("no users. exiting.")
            return False

        records_url = self.portal.get("event_urls", {}).get("oai_pmh_url")

        last_run = datetime.now()
        most_recent_datetime = self.get_most_recent_date(self.users)

        if most_recent_datetime:
            LOG.debug("start date value found in tracker state db entry.")
            from_datetime_str = most_recent_datetime.strftime(
                "%Y-%m-%dT%H:%M:%SZ")
            from_datetime = most_recent_datetime
            LOG.debug("earliest date allowed: {}".format(from_datetime_str))
        else:
            until = tracker_app.app.config.get("DISALLOW_EVENTS_BEFORE")
            if until:
                from_datetime = datetime.strptime(until, "%Y-%m-%dT%H:%M:%SZ")
                from_datetime_str = from_datetime.strftime(
                    "%Y-%m-%dT%H:%M:%SZ")
            else:
                from_datetime = datetime.now() - timedelta(days=1)
                from_datetime_str = from_datetime.strftime(
                    "%Y-%m-%dT%H:%M:%SZ")

        LOG.debug("searching oai-pmh interface: %s" % records_url)
        try:
            sickle = Sickle(records_url)
            records = sickle.ListRecords(**{
                'metadataPrefix': 'oai_dc',
                'from': from_datetime_str
            })
            if records.oai_response.http_response.status_code != 200:
                LOG.debug("non-200 response code received. "
                          "updating tracker status and exiting.")
                self.complete_tracker(
                    records.oai_response.http_response.status_code)
                return False
        except oaiexceptions.NoRecordsMatch:
            LOG.debug("end of records in oai-pmh response")
            # 'records' is unbound when ListRecords itself raises, so the
            # original status-code lookup here would fail; NoRecordsMatch
            # implies a well-formed HTTP 200 reply
            self.complete_tracker(200)
            return False

        self.parse_records(records, from_datetime, last_run)
Code example #23
def main():

    # inputs
    sleep_ct = 900  # sleep after this many records
    sleep_time = 30  # secs
    base_url = 'http://export.arxiv.org/oai2'
    fname_prefix = "./raw_data/arXiv_oai_dc_"
    fname_log = "./raw_data/harvest.log"

    # create sickle
    sickle = Sickle(base_url)

    # get list of setSpecs
    ls_setSpec = get_ls_setSpec(sickle)
    ct_sets = len(ls_setSpec)

    # read log file to get last harvest date
    dt_last_harvest = get_dt_last_harvest(fname_log)

    # append records
    ct_records = 0
    for setSpec in ls_setSpec:
        print(setSpec)
        # get data file
        fname_data = fname_prefix + setSpec.replace(":", "_") + ".oai"
        f_data = open(fname_data, 'a')
        # append records
        records = sickle.ListRecords(**{
            "metadataPrefix": "oai_dc",
            "set": setSpec,
            "from": dt_last_harvest
        })
        for record in records:
            ct_records += 1
            f_data.write(str(record.metadata) + '\n')
            f_data.write(str(record.header) + '\n')
            if ct_records % sleep_ct == 0:
                print "sleep for %d secs" % (sleep_time)
                time.sleep(sleep_time)
        f_data.close()

    # log harvest
    logger = csv.writer(open(fname_log, 'a'))
    dt_prev = dt_last_harvest
    dt_curr = datetime.datetime.today().date() - relativedelta(days=1)
    logger.writerow([dt_curr, dt_prev, ct_sets, ct_records])
Code example #24
def list_set_records(setSpec):
    set_recs = []
    sickle = Sickle(admin.get_repository_url())
    try:
        recs = sickle.ListRecords(metadataPrefix='oai_dc', set=setSpec)
        for rec in recs:
            set_recs.append({
                "identifier": rec.header.identifier,
                "datestamp": rec.header.datestamp,
                "setSpec": rec.header.setSpecs,
                "dc": rec.metadata,
            })
    except Exception:
        pass
    return set_recs
Code example #25
    def run(self):
        """
        Run the process for update Pre-prints in Solr.
        """

        if self.args.delete:
            self.solr.delete(self.args.delete, commit=True)
        else:
            print("Indexing in {0}".format(self.solr.url))

            sickle = Sickle(self.args.oai_url, verify=False)

            filters = {'metadataPrefix': 'oai_dc'}

            if self.args.time:
                filters['from'] = self.from_date.strftime("%Y-%m-%dT%H:%M:%SZ")

            try:
                records = sickle.ListRecords(**filters)
            except NoRecordsMatch as e:
                print(e)
                sys.exit(0)
            else:

                for i, record in enumerate(records):
                    try:
                        xml = self.pipeline_to_xml(record.xml)
                        print("Indexing record %s with oai id: %s" %
                              (i, record.header.identifier))
                        self.solr.update(xml, commit=True)
                    except ValueError as e:
                        print("ValueError: {0}".format(e))
                        continue
                    except Exception as e:
                        print("Error: {0}".format(e))
                        continue

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
Code example #26
 def test_retry_on_503(self):
     mock_response = Mock(status_code=503,
                          headers={'retry-after': '10'},
                          raise_for_status=Mock(side_effect=HTTPError))
     mock_get = Mock(return_value=mock_response)
     sleep_mock = Mock()
     with patch('time.sleep', sleep_mock):
         with patch.object(Session, 'get', mock_get):
             sickle = Sickle('url', max_retries=3, default_retry_after=0)
             try:
                 sickle.ListRecords()
             except HTTPError:
                 pass
             mock_get.assert_called_with('url',
                                         params={'verb': 'ListRecords'})
             self.assertEqual(4, mock_get.call_count)
             self.assertEqual(3, sleep_mock.call_count)
             sleep_mock.assert_called_with(10)
Code example #27
    def parse_list(self, response):
        sickle = Sickle(self.url)
        params = {
            'metadataPrefix': self.format,
            'set': response.meta['set'],
            'from': response.meta['from_date'],
            'until': self.until_date,
        }
        try:
            records = sickle.ListRecords(**params)
        except NoRecordsMatch as err:
            LOGGER.warning(err)
            # PEP 479: raising StopIteration inside a generator becomes a
            # RuntimeError on Python 3.7+, so return instead
            return

        # Avoid timing out the resumption token
        # TODO: implement a storage-based solution, to be able to handle
        #       large amounts of records.
        records = list(records)
        LOGGER.info(
            'Harvested %s records for params %s',
            len(records),
            params,
        )
        for record in records:
            rec_identifier = self.get_record_identifier(record)
            if rec_identifier in self._crawled_records:
                # avoid cross-set repeated records
                LOGGER.info('Skipping duplicated record %s', rec_identifier)
                continue

            LOGGER.debug(
                'Not skipping non-duplicated record %s',
                rec_identifier,
            )

            self._crawled_records[rec_identifier] = record
            response = XmlResponse(self.url, encoding='utf-8', body=record.raw)
            selector = Selector(response, type='xml')

            try:
                yield self.parse_record(selector)
            except Exception as err:
                LOGGER.error(err)
Code example #28
class DOABOAIClient():
    def __init__(self):
        self._sickle = Sickle(const.DOAB_OAI_ENDPOINT)

    def fetch_records_for_publisher_id(self, publisher_id):
        return self._fetch_records(publisher_id=publisher_id)

    def fetch_all_records(self):
        return self._fetch_records()

    def _fetch_records(self, publisher_id=None):
        kwargs = {
            "metadataPrefix": "oai_dc",
        }
        if publisher_id is not None:
            kwargs["set"] = f"publisher_{publisher_id}"

        return (DOABRecord(record)
                for record in self._sickle.ListRecords(**kwargs)
                if record_is_active_book(record))
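
Assuming const.DOAB_OAI_ENDPOINT points at the DOAB repository, usage could be as simple as (the print is a placeholder consumer):

    client = DOABOAIClient()
    for record in client.fetch_all_records():  # lazy generator of DOABRecord objects
        print(record)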
Code example #29
    def run(self):
        """
        Run the process for update Pre-prints in Solr.
        """

        if self.args.delete:
            self.solr.delete(self.args.delete, commit=True)
        else:
            logger.info("Indexing in {0}".format(self.solr.url))

            sickle = Sickle(self.args.oai_url)

            records = sickle.ListRecords(
                **{
                    'metadataPrefix': 'oai_dc',
                    'from': self.from_date.strftime("%Y-%m-%dT%H:%M:%SZ")
                })

            for record in records:
                try:
                    xml = self.pipeline_to_xml(record.xml)
                    self.solr.update(xml, commit=True)
                except ValueError as e:
                    logger.error("ValueError: {0}".format(e))
                    logger.exception(e)
                    continue
                except Exception as e:
                    logger.error("Error: {0}".format(e))
                    logger.exception(e)
                    continue

        # optimize the index
        self.solr.commit()
        self.solr.optimize()
Code example #30
# minimal stand-in for the logging helper this excerpt assumes; the flush
# forces print output to appear immediately, see:
# http://stackoverflow.com/questions/230751/how-to-flush-output-of-python-print
def printLog(text):
    print(text)
    sys.stdout.flush()


runningFromWithinStabi = False
# main PPN harvesting
savedRecords = []

if runningFromWithinStabi:
    proxy = urllib.request.ProxyHandler({})
    opener = urllib.request.build_opener(proxy)
    urllib.request.install_opener(opener)

# create OAI-PMH reader pointing to the Stabi OAI-PMH endpoint of the digitized collections
sickle = Sickle('http://digital.staatsbibliothek-berlin.de/oai')
records = sickle.ListRecords(metadataPrefix='oai_dc', set='DC_all')
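# 'DC_all' presumably selects the complete digitized collection; Sickle follows
# resumption tokens lazily as the loop below consumes the records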

if True:
    printLog("Starting OAI-PMH record download...")
    # initialize some variables for counting and saving the metadata records
    savedDocs = 0

    maxDocs = 146000  # lower this (e.g., to 100) for quick tests; larger values take correspondingly longer to download

    # save the records locally as we don't want to have to rely on a connection to the OAI-PMH server all the time
    # iterate over all records until maxDocs is reached
    # ATTENTION! if you re-run this cell, the contents of the savedRecords array will be altered!
    for record in records:
        # check if we reach the maximum document value
        if savedDocs < maxDocs:
            savedDocs = savedDocs + 1