def test_model_based_harvesting_list(app, sample_config, sample_list_xml):
    """Test harvesting using model.

    Registers a mocked OAI-PMH response for the arXiv ``physics`` set,
    harvests through the stored configuration named by ``sample_config``
    and checks both the number of harvested records and that the
    configuration's ``lastrun`` value moved forward.
    """
    from invenio_oaiharvester.utils import get_oaiharvest_object

    # ``https?`` accepts both http and https (``http?`` would make the "p"
    # optional instead); the dots in the host name are escaped so they only
    # match literal dots.
    responses.add(
        responses.GET,
        re.compile(r'https?://export\.arxiv\.org/oai2.*set=physics.*'),
        body=sample_list_xml,
        content_type='text/xml'
    )

    with app.app_context():
        source = get_oaiharvest_object(sample_config)
        last_updated = source.lastrun
        time.sleep(0.1)  # to allow for date checking to work better
        _, records = list_records(name=sample_config)

        assert len(records) == 150
        # Harvesting by configuration name is expected to refresh lastrun.
        assert last_updated < get_oaiharvest_object(sample_config).lastrun
def oai_set_last_run(name, date, verbose=False):
    """Store a new ``lastrun`` value on an OAI harvest configuration.

    :param name: name of the OAI harvest configuration.
    :param date: value to set as last run; a ``str`` is converted with
        ``parser.parse``, any other value is stored as given.
    :param verbose: when true, echo the stored value or the failure reason.
    :return: the value that was stored, or ``None`` when the configuration
        was not found or the date string could not be parsed.
    """
    try:
        harvest_config = get_oaiharvest_object(name)
        new_lastrun = parser.parse(date) if isinstance(date, str) else date
        harvest_config.update_lastrun(new_lastrun)
        harvest_config.save()
        db.session.commit()
        if verbose:
            success_msg = 'OAI {name}: set last run: {last_run}'.format(
                name=name,
                last_run=new_lastrun
            )
            click.echo(success_msg)
        return new_lastrun
    except InvenioOAIHarvesterConfigNotFound:
        # Unknown configuration name: report (if asked) and fall through.
        if verbose:
            not_found_msg = ('ERROR OAI config not found: {name}').format(
                name=name,
            )
            click.echo(not_found_msg)
    except parser.ParserError as err:
        # The date string could not be parsed: report (if asked) and fall
        # through.
        if verbose:
            parse_msg = ('OAI set lastrun {name}: {err}').format(
                name=name,
                err=err
            )
            click.echo(parse_msg)
    return None
def test_model_based_harvesting_list(app, sample_config, sample_list_xml):
    """Test harvesting using model.

    A mocked OAI-PMH response is registered for the arXiv ``physics`` set;
    the test then harvests via the configuration named by ``sample_config``
    and verifies the record count and that ``lastrun`` advanced.
    """
    from invenio_oaiharvester.utils import get_oaiharvest_object

    # The optional character must be the "s" of https, not the "p" of http,
    # and the host-name dots are escaped to match literally.
    arxiv_physics_url = re.compile(
        r'https?://export\.arxiv\.org/oai2.*set=physics.*'
    )
    responses.add(
        responses.GET,
        arxiv_physics_url,
        body=sample_list_xml,
        content_type='text/xml'
    )

    with app.app_context():
        source = get_oaiharvest_object(sample_config)
        last_updated = source.lastrun
        time.sleep(0.1)  # to allow for date checking to work better
        _, records = list_records(name=sample_config)

        assert len(records) == 150
        # The stored configuration must carry a newer lastrun afterwards.
        assert last_updated < get_oaiharvest_object(sample_config).lastrun
def test_model_based_harvesting_list(self):
    """Test harvesting using model.

    Loads the sample ListRecords XML stored next to this test module,
    registers it as the mocked response for the arXiv ``physics`` set,
    harvests through the ``arXiv`` configuration and checks the record
    count and that the configuration's ``lastrun`` moved forward.
    """
    from invenio_oaiharvester.utils import get_oaiharvest_object

    source = get_oaiharvest_object('arXiv')
    last_updated = source.lastrun

    sample_path = os.path.join(
        os.path.dirname(__file__),
        "data/sample_arxiv_response_listrecords_physics.xml"
    )
    # Context manager so the fixture file handle is closed deterministically.
    with open(sample_path) as sample_file:
        raw_physics_xml = sample_file.read()

    # ``https?`` accepts http and https (``http?`` would make the "p"
    # optional instead); host-name dots are escaped to match literally.
    responses.add(
        responses.GET,
        re.compile(r'https?://export\.arxiv\.org/oai2.*set=physics&.*'),
        body=raw_physics_xml,
        content_type='text/xml'
    )

    _, records = list_records(name='arXiv')

    # Dedicated assertions report the actual values on failure.
    self.assertEqual(len(records), 150)
    self.assertLess(last_updated, get_oaiharvest_object('arXiv').lastrun)
def oai_get_last_run(name, verbose=False):
    """Look up the ``lastrun`` value of an OAI harvest configuration.

    :param name: name of the OAI harvest configuration.
    :param verbose: when true, echo the value found or the lookup failure.
    :return: the configuration's ``lastrun`` value, or ``None`` when no
        configuration with that name exists.
    """
    try:
        lastrun_date = get_oaiharvest_object(name).lastrun
    except InvenioOAIHarvesterConfigNotFound:
        if verbose:
            click.echo(f'ERROR OAI config not found: {name}')
        return None
    else:
        if verbose:
            click.echo(f'OAI {name}: last run: {lastrun_date}')
        return lastrun_date
def oai_process_records_from_dates(name, sickle, oai_item_iterator,
                                   transformation, record_cls, max_retries=0,
                                   access_token=None, days_spann=30,
                                   from_date=None, until_date=None,
                                   ignore_deleted=False, dbcommit=True,
                                   reindex=True, test_md5=True, online=False,
                                   verbose=False, debug=False, **kwargs):
    """Harvest multiple records from an OAI repo.

    The date range is walked in windows of ``days_spann`` days, once per
    set specification of the harvest configuration. Every harvested record
    is transformed and handed to
    ``record_cls.create_or_update_agent_mef_viaf``.

    :param name: The name of the OAIHarvestConfig to use instead of passing
        specific parameters.
    :param sickle: callable used to build the OAI client; it is invoked as
        ``sickle(url, iterator=oai_item_iterator, max_retries=max_retries)``.
    :param oai_item_iterator: iterator passed through to ``sickle``.
    :param transformation: callable applied to the first parsed record; the
        ``json`` attribute of its result is the data to store.
    :param record_cls: class providing ``create_or_update_agent_mef_viaf``.
    :param max_retries: passed through to ``sickle``.
    :param access_token: when set, sent as the ``accessToken`` request
        parameter.
    :param days_spann: number of days covered by each harvesting window.
    :param from_date: The lower bound date for the harvesting (optional);
        defaults to the last run stored for the configuration.
    :param until_date: The upper bound date for the harvesting (optional);
        defaults to the time this function was called.
    :param ignore_deleted: sent as the ``ignore_deleted`` request parameter.
    :param dbcommit: accepted for interface compatibility; not used here.
    :param reindex: accepted for interface compatibility; not used here.
    :param test_md5: accepted for interface compatibility; not used here.
    :param online: forwarded to ``create_or_update_agent_mef_viaf``; a
        truthy ``kwargs['kwargs']['online']`` takes precedence.
    :param verbose: echo progress information.
    :param debug: print tracebacks for caught exceptions.
    :raises WrongDateCombination: if an until date is given and the from
        date compares greater than it.
    :return: tuple ``(count, action_count, mef_action_count)``. ``count``
        is the number of records seen, but it is reset to ``-1`` whenever a
        window fails with an unexpected error (later records keep
        incrementing from there). The two dicts map action names to their
        number of occurrences.
    """
    # data on IDREF Servers starts on 2000-10-01
    nested_online = kwargs.get('kwargs', {}).get('online')
    if nested_online:
        online = nested_online
    url, metadata_prefix, last_run, setspecs = get_info_by_oai_name(name)

    request = sickle(url, iterator=oai_item_iterator,
                     max_retries=max_retries)

    dates_inital = {
        'from': from_date or last_run,
        'until': until_date
    }
    # The stored last run is only moved forward for a default (incremental)
    # harvest, not when the caller picked an explicit date range.
    update_last_run = from_date is None and until_date is None
    # Sanity check
    if dates_inital['until'] is not None \
            and dates_inital['from'] > dates_inital['until']:
        raise WrongDateCombination("'Until' date larger than 'from' date.")

    last_run_date = datetime.now()

    # If we don't have specifications for set searches the setspecs will be
    # set to a list with None to go into the retrieval loop without
    # a set definition.
    setspecs = setspecs.split() or [None]
    count = 0
    action_count = {}
    mef_action_count = {}
    for spec in setspecs:
        params = {
            'metadataPrefix': metadata_prefix,
            'ignore_deleted': ignore_deleted
        }
        if access_token:
            params['accessToken'] = access_token
        params.update(dates_inital)
        if spec:
            params['set'] = spec

        my_from_date = parser.parse(dates_inital['from'])
        my_until_date = last_run_date
        if dates_inital['until']:
            my_until_date = parser.parse(dates_inital['until'])
        while my_from_date <= my_until_date:
            # Clamp the end of the window to the overall upper bound.
            window_until = min(
                my_from_date + timedelta(days=days_spann),
                my_until_date
            )
            params.update({
                'from': my_from_date.strftime("%Y-%m-%d"),
                'until': window_until.strftime("%Y-%m-%d")
            })

            try:
                for record in request.ListRecords(**params):
                    count += 1
                    records = parse_xml_to_array(StringIO(record.raw))
                    # Reset per record so the error handler below never
                    # reads an unbound name or data of a previous record.
                    rec = None
                    try:
                        try:
                            updated = datetime.strptime(
                                records[0]['005'].data,
                                '%Y%m%d%H%M%S.%f'
                            )
                        except Exception:
                            # Missing or malformed 005 field: the timestamp
                            # is only used for the verbose output.
                            updated = '????'
                        rec = transformation(records[0]).json
                        pid = rec.get('pid')
                        # NOTE(review): dbcommit/reindex are hard-coded to
                        # True here, so the function's own ``dbcommit`` and
                        # ``reindex`` arguments have no effect - confirm
                        # whether this is intended.
                        rec, action, m_record, m_action, v_record, v_online = \
                            record_cls.create_or_update_agent_mef_viaf(
                                data=rec,
                                dbcommit=True,
                                reindex=True,
                                online=online,
                                verbose=verbose
                            )
                        action_count.setdefault(action.name, 0)
                        action_count[action.name] += 1
                        mef_action_count.setdefault(m_action.name, 0)
                        mef_action_count[m_action.name] += 1

                        if verbose:
                            m_pid = 'Non'
                            if m_record:
                                m_pid = m_record.pid
                            v_pid = 'Non'
                            if v_record:
                                v_pid = v_record.pid
                            click.echo(
                                (
                                    'OAI {name} spec({spec}): {pid}'
                                    ' updated: {updated} {action}'
                                    ' | mef: {m_pid} {m_action}'
                                    ' | viaf: {v_pid} online: {online}'
                                ).format(
                                    name=name,
                                    spec=spec,
                                    pid=pid,
                                    action=action.value,
                                    m_pid=m_pid,
                                    m_action=m_action.value,
                                    v_pid=v_pid,
                                    online=v_online,
                                    updated=updated
                                )
                            )
                    except Exception as err:
                        # A single failing record must not stop the harvest:
                        # log it and continue with the next record.
                        msg = 'ERROR creating {name} {count}: {err}'
                        msg = msg.format(
                            name=name,
                            count=count,
                            err=err
                        )
                        if rec:
                            msg += '\n{rec}'.format(rec=rec)
                        current_app.logger.error(msg)
                        if debug:
                            traceback.print_exc()
            except NoRecordsMatch:
                # Empty window: move on without the progress output.
                my_from_date = my_from_date + timedelta(days=days_spann + 1)
                continue
            except Exception as err:
                current_app.logger.error(err)
                if debug:
                    traceback.print_exc()
                count = -1

            # Windows are inclusive on both ends, so the next one starts the
            # day after the current window's nominal end.
            my_from_date = my_from_date + timedelta(days=days_spann + 1)
            if verbose:
                click.echo(
                    ('OAI {name} {spec}: {from_d} .. +{days_spann}').format(
                        name=name,
                        spec=spec,
                        from_d=my_from_date.strftime("%Y-%m-%d"),
                        days_spann=days_spann
                    )
                )

    if update_last_run:
        if verbose:
            click.echo(
                ('OAI {name}: update last run: {last_run}').format(
                    name=name,
                    last_run=last_run_date
                )
            )
        oai_source = get_oaiharvest_object(name)
        oai_source.update_lastrun(last_run_date)
        oai_source.save()
        db.session.commit()
    return count, action_count, mef_action_count