def test_model_based_harvesting_list(app, sample_config, sample_list_xml):
    """Test harvesting a record list through a stored configuration.

    A mocked OAI response is registered for the arXiv "physics" set, then
    ``list_records`` is run by configuration name. The test checks the
    number of harvested records and that the configuration's ``lastrun``
    moved forward.
    """
    from invenio_oaiharvester.utils import get_oaiharvest_object
    responses.add(
        responses.GET,
        # ``https?`` matches both the http and the https scheme; the dots
        # of the host name are escaped so they only match a literal dot.
        re.compile(r'https?://export\.arxiv\.org/oai2.*set=physics.*'),
        body=sample_list_xml,
        content_type='text/xml',
    )
    with app.app_context():
        last_updated = get_oaiharvest_object(sample_config).lastrun
        time.sleep(0.1)  # to allow for date checking to work better
        _, records = list_records(name=sample_config)

        assert len(records) == 150
        assert last_updated < get_oaiharvest_object(sample_config).lastrun
Example #2
0
def oai_set_last_run(name, date, verbose=False):
    """Store a new last-run date on an OAI harvest configuration.

    :param name: name of the OAI harvest configuration.
    :param date: date to set as last run; a string is parsed with
        ``parser.parse``, any other value is used as given.
    :param verbose: echo the outcome (success or error) with click.
    :return: the date that was set, or None when the configuration was
        not found or the date string could not be parsed.
    """
    try:
        harvest_config = get_oaiharvest_object(name)
        new_lastrun = parser.parse(date) if isinstance(date, str) else date
        harvest_config.update_lastrun(new_lastrun)
        harvest_config.save()
        db.session.commit()
        if verbose:
            success_msg = 'OAI {name}: set last run: {last_run}'.format(
                name=name,
                last_run=new_lastrun
            )
            click.echo(success_msg)
        return new_lastrun
    except InvenioOAIHarvesterConfigNotFound:
        # Unknown configuration name: report it and fall through to None.
        if verbose:
            click.echo(('ERROR OAI config not found: {name}').format(
                name=name,
            ))
    except parser.ParserError as err:
        # The date string could not be parsed: report it and fall through.
        if verbose:
            click.echo(('OAI set lastrun {name}: {err}').format(
                name=name,
                err=err
            ))
    return None
def test_model_based_harvesting_list(app, sample_config, sample_list_xml):
    """Test harvesting a record list using a stored harvest configuration.

    Registers a mocked OAI response for the arXiv "physics" set, harvests
    with ``list_records`` by configuration name and verifies both the
    record count and that ``lastrun`` of the configuration was advanced.
    """
    from invenio_oaiharvester.utils import get_oaiharvest_object

    # ``s?`` makes only the "s" optional, so http and https both match;
    # the host name dots are escaped to match literal dots only.
    arxiv_physics_url = re.compile(
        r'https?://export\.arxiv\.org/oai2.*set=physics.*'
    )
    responses.add(
        responses.GET,
        arxiv_physics_url,
        body=sample_list_xml,
        content_type='text/xml'
    )
    with app.app_context():
        last_updated = get_oaiharvest_object(sample_config).lastrun
        time.sleep(0.1)  # to allow for date checking to work better
        _, records = list_records(name=sample_config)

        assert len(records) == 150
        assert last_updated < get_oaiharvest_object(sample_config).lastrun
    def test_model_based_harvesting_list(self):
        """Test harvesting a record list using the 'arXiv' configuration.

        A mocked OAI response built from the bundled sample file is
        registered for the arXiv "physics" set. After ``list_records`` the
        test checks the record count and that ``lastrun`` of the
        configuration was advanced.
        """
        from invenio_oaiharvester.utils import get_oaiharvest_object

        source = get_oaiharvest_object('arXiv')
        last_updated = source.lastrun

        sample_path = os.path.join(
            os.path.dirname(__file__),
            "data/sample_arxiv_response_listrecords_physics.xml"
        )
        # Read inside a context manager so the file handle is closed.
        with open(sample_path) as sample_file:
            raw_physics_xml = sample_file.read()

        responses.add(
            responses.GET,
            # ``https?`` matches both http and https; host dots are escaped.
            re.compile(r'https?://export\.arxiv\.org/oai2.*set=physics&.*'),
            body=raw_physics_xml,
            content_type='text/xml'
        )

        _, records = list_records(name='arXiv')

        # Dedicated assertions report both operands on failure.
        self.assertEqual(len(records), 150)
        self.assertLess(last_updated, get_oaiharvest_object('arXiv').lastrun)
Example #5
0
def oai_get_last_run(name, verbose=False):
    """Read the last-run date of an OAI harvest configuration.

    :param name: name of the OAI harvest configuration.
    :param verbose: echo the result (or the lookup error) with click.
    :return: ``lastrun`` of the configuration, or None when no
        configuration with this name is found.
    """
    try:
        lastrun_date = get_oaiharvest_object(name).lastrun
    except InvenioOAIHarvesterConfigNotFound:
        # Unknown configuration name: report it and signal with None.
        if verbose:
            click.echo((f'ERROR OAI config not found: {name}'))
        return None

    if verbose:
        click.echo(f'OAI {name}: last run: {lastrun_date}')
    return lastrun_date
Example #6
0
def oai_process_records_from_dates(name, sickle, oai_item_iterator,
                                   transformation, record_cls, max_retries=0,
                                   access_token=None, days_spann=30,
                                   from_date=None, until_date=None,
                                   ignore_deleted=False, dbcommit=True,
                                   reindex=True, test_md5=True, online=False,
                                   verbose=False, debug=False, **kwargs):
    """Harvest records from an OAI repo in consecutive date windows.

    The harvest settings (URL, metadata prefix, last run and set specs) are
    looked up with ``get_info_by_oai_name``. For every set spec, the range
    between the start and the end date is requested in windows of
    ``days_spann`` days. Each harvested record is parsed, transformed and
    handed to ``record_cls.create_or_update_agent_mef_viaf``. When neither
    ``from_date`` nor ``until_date`` is given, the last run of the
    configuration is updated at the end.

    :param name: The name of the OAIHarvestConfig to use instead of passing
                 specific parameters.
    :param sickle: callable creating the OAI client; it is called as
                   ``sickle(url, iterator=..., max_retries=...)``.
    :param oai_item_iterator: iterator handed to ``sickle``.
    :param transformation: callable applied to the first parsed record;
                           ``.json`` of its result is the record data.
    :param record_cls: class providing ``create_or_update_agent_mef_viaf``.
    :param max_retries: handed to ``sickle``.
    :param access_token: sent as ``accessToken`` request parameter if set.
    :param days_spann: length in days of one request window.
    :param from_date: The lower bound date for the harvesting (optional);
                      the last run of the configuration is used if empty.
    :param until_date: The upper bound date for the harvesting (optional);
                       the current time is used if empty.
    :param ignore_deleted: sent as ``ignore_deleted`` request parameter.
    :param dbcommit: not used by this function (see the review note at
                     the ``create_or_update_agent_mef_viaf`` call).
    :param reindex: not used by this function (see the same note).
    :param test_md5: not used by this function.
    :param online: handed to ``create_or_update_agent_mef_viaf``; a truthy
                   ``kwargs['kwargs']['online']`` takes precedence.
    :param verbose: echo progress information with click.
    :param debug: print tracebacks of caught exceptions.
    :return: tuple ``(count, action_count, mef_action_count)``; ``count``
             is reset to -1 when a request window fails unexpectedly.
    :raises WrongDateCombination: if the start date is greater than
                                  ``until_date``.
    """
    # data on IDREF Servers starts on 2000-10-01
    nested_online = kwargs.get('kwargs', {}).get('online')
    if nested_online:
        online = nested_online
    url, metadata_prefix, last_run, setspecs = get_info_by_oai_name(name)

    request = sickle(url, iterator=oai_item_iterator, max_retries=max_retries)

    dates_inital = {
        'from': from_date or last_run,
        'until': until_date
    }
    update_last_run = from_date is None and until_date is None
    # Sanity check
    # NOTE(review): the condition fires when 'from' is greater than
    # 'until', the message reads the other way round - confirm before
    # rewording, callers may match on the text.
    if dates_inital['until'] is not None \
            and dates_inital['from'] > dates_inital['until']:
        raise WrongDateCombination("'Until' date larger than 'from' date.")

    last_run_date = datetime.now()

    # If we don't have specifications for set searches the setspecs will be
    # set to a list with None to go into the retrieval loop without
    # a set definition. A missing (None) setspecs value is treated the same.
    setspecs = (setspecs or '').split() or [None]
    count = 0
    action_count = {}
    mef_action_count = {}
    for spec in setspecs:
        params = {
            'metadataPrefix': metadata_prefix,
            'ignore_deleted': ignore_deleted
        }
        if access_token:
            params['accessToken'] = access_token
        # 'from'/'until' are overwritten for every window below; they are
        # inserted here already to keep the request parameter order stable.
        params.update(dates_inital)
        if spec:
            params['set'] = spec

        my_from_date = parser.parse(dates_inital['from'])
        my_until_date = last_run_date
        if dates_inital['until']:
            my_until_date = parser.parse(dates_inital['until'])
        while my_from_date <= my_until_date:
            # Never request beyond the overall upper bound.
            window_until = my_from_date + timedelta(days=days_spann)
            if window_until > my_until_date:
                window_until = my_until_date
            params.update({
                'from': my_from_date.strftime("%Y-%m-%d"),
                'until': window_until.strftime("%Y-%m-%d")
            })

            try:
                for record in request.ListRecords(**params):
                    count += 1
                    records = parse_xml_to_array(StringIO(record.raw))
                    # Reset per record so the error handler below never
                    # reads an unbound name or the data of an earlier
                    # record.
                    rec = None
                    try:
                        try:
                            updated = datetime.strptime(
                                records[0]['005'].data,
                                '%Y%m%d%H%M%S.%f'
                            )
                        except Exception:
                            # Field 005 is missing or not parsable; the
                            # value is only used for the verbose output.
                            updated = '????'
                        rec = transformation(records[0]).json
                        pid = rec.get('pid')
                        # NOTE(review): dbcommit/reindex are hard-coded to
                        # True, the parameters of this function are not
                        # passed on - confirm whether this is intended.
                        rec, action, m_record, m_action, v_record, v_online = \
                            record_cls.create_or_update_agent_mef_viaf(
                                data=rec,
                                dbcommit=True,
                                reindex=True,
                                online=online,
                                verbose=verbose
                            )
                        action_count.setdefault(action.name, 0)
                        action_count[action.name] += 1
                        mef_action_count.setdefault(m_action.name, 0)
                        mef_action_count[m_action.name] += 1

                        if verbose:
                            m_pid = 'Non'
                            if m_record:
                                m_pid = m_record.pid
                            v_pid = 'Non'
                            if v_record:
                                v_pid = v_record.pid
                            click.echo(
                                (
                                    'OAI {name} spec({spec}): {pid}'
                                    ' updated: {updated} {action}'
                                    ' | mef: {m_pid} {m_action}'
                                    ' | viaf: {v_pid} online: {online}'
                                ).format(
                                    name=name,
                                    spec=spec,
                                    pid=pid,
                                    action=action.value,
                                    m_pid=m_pid,
                                    m_action=m_action.value,
                                    v_pid=v_pid,
                                    online=v_online,
                                    updated=updated
                                )
                            )
                    except Exception as err:
                        # A single failing record is logged and skipped.
                        msg = 'ERROR creating {name} {count}: {err}'
                        msg = msg.format(
                            name=name,
                            count=count,
                            err=err
                        )
                        if rec:
                            msg += '\n{rec}'.format(rec=rec)

                        current_app.logger.error(msg)
                        if debug:
                            traceback.print_exc()
            except NoRecordsMatch:
                # Nothing to harvest in this window: go on with the next.
                my_from_date = my_from_date + timedelta(days=days_spann + 1)
                continue
            except Exception as err:
                # The window failed as a whole; flag it in the counter.
                current_app.logger.error(err)
                if debug:
                    traceback.print_exc()
                count = -1

            # Windows are inclusive, so the next one starts a day later.
            my_from_date = my_from_date + timedelta(days=days_spann + 1)
            if verbose:
                click.echo(
                    ('OAI {name} {spec}: {from_d} .. +{days_spann}').format(
                        name=name,
                        spec=spec,
                        from_d=my_from_date.strftime("%Y-%m-%d"),
                        days_spann=days_spann
                    )
                )

    if update_last_run:
        if verbose:
            click.echo(
                ('OAI {name}: update last run: {last_run}').format(
                    name=name,
                    last_run=last_run_date
                )
            )
        oai_source = get_oaiharvest_object(name)
        oai_source.update_lastrun(last_run_date)
        oai_source.save()
        db.session.commit()
    return count, action_count, mef_action_count