Ejemplo n.º 1
0
def bulk_indexer(pid_type, object_uuids, req_timeout):
    """Bulk-index the records identified by ``object_uuids``.

    The indexer and record class are looked up from the drafts endpoint
    registered for ``pid_type``.  If the bulk call as a whole fails, the batch
    is retried in smaller pieces (halves for more than four UUIDs, otherwise
    one UUID at a time) so that everything that can be indexed is indexed.

    Args:
        pid_type: PID type used to look up the endpoint configuration.
        object_uuids: sized, sliceable sequence of record UUIDs.
        req_timeout: request timeout forwarded to the bulk helper.

    Returns:
        A ``(success_count, errors)`` tuple.  ``errors`` combines the errors
        reported by the bulk helper with dictionaries (``message``,
        ``traceback`` and, where known, ``record_uuid``) describing
        exceptions caught locally.
    """
    exceptions = []
    try:
        if not bulk_app:
            # Create the API application lazily and cache it for later calls.
            bulk_app.append(create_api())

        with bulk_app[0].app_context():
            endpoint: RecordEndpointConfiguration = current_drafts.endpoint_for_pid_type(
                pid_type)
            record_class = endpoint.record_class
            indexer_class = endpoint.indexer_class

            indexer = indexer_class()
            # force record class
            indexer.record_cls = record_class

            def get_indexing_data(record_uuid):
                # A record that cannot be prepared is reported in
                # ``exceptions`` and replaced by an empty action so the
                # rest of the batch is still processed.
                try:
                    return indexer._index_action({"id": record_uuid})
                except Exception as e:
                    exceptions.append({
                        'record_uuid': str(record_uuid),
                        'message': str(e),
                        'traceback': traceback.format_exc(),
                    })
                return {}

            recs = (get_indexing_data(record_uuid)
                    for record_uuid in object_uuids)

            success, errors = bulk(indexer.client,
                                   recs,
                                   stats_only=False,
                                   request_timeout=req_timeout,
                                   expand_action_callback=_es7_expand_action,
                                   raise_on_error=False)

            return success, [*errors, *exceptions]
    except Exception as e:
        if len(object_uuids) == 1:
            # Nothing left to split: report the failure for this single item.
            return 0, [{
                'message': str(e),
                'traceback': traceback.format_exc()
            }, *exceptions]

        # index what could be indexed
        if len(object_uuids) > 4:
            # Split into two halves and retry each half.  Floor division is
            # required: a float midpoint cannot be used as a slice index.
            mid = len(object_uuids) // 2
            chunks = [object_uuids[:mid], object_uuids[mid:]]
        else:
            # Small batch: retry every element on its own.
            chunks = [[x] for x in object_uuids]

        ok = 0
        errors = []
        for chunk in chunks:
            p_ok, p_errors = bulk_indexer(pid_type, chunk, req_timeout)
            ok += p_ok
            errors.extend(p_errors)
        return ok, errors
Ejemplo n.º 2
0
def article_reindex(ctx, raise_on_error=True, only=None):
    """Reindex all registered article and article-draft records.

    Every failure is appended (record dump plus traceback) to
    ``/tmp/indexing-error.json``.

    Args:
        ctx: invocation context; not used by this function.
        raise_on_error: when true, re-raise an indexing failure after it has
            been logged; otherwise log it and continue with the next record.
        only: optional record id as a string; when given, only the record
            whose ``str(record.id)`` equals it is indexed.
    """
    api = create_api()
    with api.app_context():
        def reindex_pid(pid_type, RecordClass):
            # Name of the last index written to; refreshed/flushed at the end.
            index_name = None
            indexer = RecordIndexer()
            for pid in tqdm.tqdm(PersistentIdentifier.query.filter_by(
                    pid_type=pid_type, object_type='rec', status=PIDStatus.REGISTERED.value)):
                record = RecordClass.get_record(pid.object_uuid)
                if only and str(record.id) != only:
                    continue
                try:
                    index_name, _doc_type = indexer.record_to_index(record)
                    index_name = build_alias_name(index_name)
                    indexer.index(record)
                except Exception:
                    # ``Exception`` rather than a bare ``except`` so that
                    # KeyboardInterrupt/SystemExit are never swallowed when
                    # raise_on_error is false.
                    # The dump uses ensure_ascii=False, so pin the file
                    # encoding instead of relying on the locale default.
                    with open('/tmp/indexing-error.json', 'a', encoding='utf-8') as f:
                        print(json.dumps(record.dumps(), indent=4, ensure_ascii=False), file=f)
                        traceback.print_exc(file=f)
                    if raise_on_error:
                        raise
            if index_name:
                current_search_client.indices.refresh(index_name)
                current_search_client.indices.flush(index_name)

        # reindex all objects
        reindex_pid(ARTICLE_PID_TYPE, ArticleRecord)
        reindex_pid(ARTICLE_DRAFT_PID_TYPE, ArticleDraftRecord)
Ejemplo n.º 3
0
def run(provider,
        synchronizer,
        break_on_error,
        start_oai,
        start_id,
        oai,
        overwrite,
        bulk,
        only_fetch,
        index: str = None):
    """
    Starts harvesting the resources set in invenio.cfg through the OAREPO_OAI_PROVIDERS
    environment variable.
    """
    # Gather every option once; they are forwarded unchanged, by keyword,
    # to the internal implementation.
    options = {
        'provider': provider,
        'synchronizer': synchronizer,
        'break_on_error': break_on_error,
        'start_oai': start_oai,
        'start_id': start_id,
        'oai': oai,
        'overwrite': overwrite,
        'bulk': bulk,
        'only_fetch': only_fetch,
        'index': index,
    }
    # The harvest has to run inside the API application's context.
    with create_api().app_context():
        _run_internal(**options)
Ejemplo n.º 4
0
def demo_reindex(only=None):
    """Index every ``recid`` record, optionally restricted to a single id.

    When ``only`` is given, just the record whose ``str(record.id)`` equals
    it is indexed.  On an indexing failure the record is dumped to
    ``/tmp/indexing-error.json`` (overwriting the file) and the error is
    re-raised.
    """
    application = create_api()
    with application.app_context():
        matching_pids = PersistentIdentifier.query.filter_by(
            pid_type='recid', object_type='rec')
        for identifier in matching_pids:
            rec = Record.get_record(identifier.object_uuid)
            selected = not only or str(rec.id) == only
            if not selected:
                continue
            try:
                RecordIndexer().index(rec)
            except:
                # Keep a copy of the offending record for inspection, then
                # let the original error propagate.
                with open('/tmp/indexing-error.json', 'w') as dump_file:
                    serialized = json.dumps(rec.dumps(),
                                            indent=4,
                                            ensure_ascii=False)
                    print(serialized, file=dump_file)
                raise
        # Flush once all selected records have been indexed.
        current_search_client.indices.flush()
Ejemplo n.º 5
0
def nr_recommit(ctx):
    """Re-commit every record of every configured REST endpoint.

    For each endpoint in ``RECORDS_REST_ENDPOINTS`` the records behind all
    PIDs of its ``pid_type`` are loaded and committed again.  The database
    session is committed on every 100th record (starting with the first)
    and once more after each endpoint, even if processing it failed.
    ``ctx`` is not used.
    """
    with create_api().app_context():
        rest_endpoints = current_app.config.get("RECORDS_REST_ENDPOINTS").endpoints
        for endpoint_cfg in rest_endpoints.values():
            try:
                pid_type: str = endpoint_cfg["pid_type"]
                print(f'PID type: {pid_type}')
                record_class = obj_or_import_string(endpoint_cfg["record_class"])
                all_pids = PersistentIdentifier.query.filter_by(
                    pid_type=pid_type).all()
                for position, pid in enumerate(tqdm(all_pids)):
                    try:
                        record = record_class.get_record(pid.object_uuid)
                    except NoResultFound:
                        # PID without a stored record: nothing to commit.
                        continue
                    started = datetime.now()
                    record.commit()
                    elapsed = datetime.now() - started
                    print(f"Commiting time: {elapsed}")
                    if not position % 100:
                        db.session.commit()
            finally:
                db.session.commit()
Ejemplo n.º 6
0
        original_script_name = environ.get("SCRIPT_NAME", "")
        environ["SCRIPT_NAME"] = original_script_name + script
        environ["PATH_INFO"] = path_info
        return self.app(environ, start_response)


class HeartbeatMiddleware:
    """HeartBeat endpoints WSGI middleware."""

    def __init__(self, app):
        """Remember the wrapped WSGI application."""
        self.app = app

    def __call__(self, environ, start_response):
        """Serve the .well-known heartbeat endpoints, delegate everything else."""
        with application.app_context():
            path = environ.get('PATH_INFO', '')
            heartbeat_response = None
            if path == '/.well-known/heartbeat/readiness':
                heartbeat_response = readiness()
            elif path == '/.well-known/heartbeat/liveliness':
                heartbeat_response = liveliness()

            # A truthy heartbeat response is itself invoked as the WSGI
            # callable; any other request goes to the wrapped application.
            handler = heartbeat_response if heartbeat_response else self.app
            return handler(environ, start_response)


# WSGI entry point: the API application with its WSGI callable wrapped first
# by PrefixMiddleware and then by HeartbeatMiddleware, so heartbeat paths are
# checked before the request reaches PrefixMiddleware.
# HeartbeatMiddleware.__call__ reads this module-level ``application`` name.
application = create_api()
application.wsgi_app = HeartbeatMiddleware(
    PrefixMiddleware(application.wsgi_app))
Ejemplo n.º 7
0
import itertools
import time
from multiprocessing.pool import Pool
from random import random

from invenio_app.factory import create_api
from invenio_db import db
from invenio_records.models import RecordMetadata
from sqlalchemy.orm.attributes import flag_modified
from tqdm import tqdm

# Module-level API application; set_providers() runs inside its app context.
app = create_api()


def set_providers(*mds):
    """Process the provider information of the given record metadata rows.

    ``mds`` are the ids of the ``RecordMetadata`` rows to load.  The work is
    done inside the application context and a nested database transaction.

    NOTE(review): the visible body only reads data (``primary_community`` and
    ``self_link`` are assigned but never used) — the function looks
    truncated here; confirm against the full source.
    """
    with app.app_context():
        with db.session.begin_nested():
            for md in RecordMetadata.query.filter(RecordMetadata.id.in_(mds)):
                control_number = md.json.get('control_number')
                print(f"Control number: {control_number}")
                providers = md.json.get('provider')

                primary_community = None

                if providers:
                    # Pick the provider with the highest 'level' ...
                    provider = None
                    for p in providers:
                        if not provider or p['level'] > provider['level']:
                            provider = p
                    # NOTE(review): ... but the result of the loop above is
                    # discarded here in favour of the first provider.  One of
                    # the two is presumably unintended — confirm.
                    provider = providers[0]
                    self_link = provider.get('links', {}).get('self')
Ejemplo n.º 8
0
def setup(admin_password,
          recreate_db,
          skip_demo_data,
          skip_file_location,
          drop_taxonomies,
          skip_taxonomy_import,
          verbose,
          taxonomies='./assets/taxonomy'):
    """OARepo setup command.

    Clears the redis cache, (re)creates the database and search indexes,
    creates the admin role and user, and optionally sets up the files
    location, taxonomies and demo data.

    Args:
        admin_password: password for the created admin user.
        recreate_db: when true, destroy and re-initialise the database;
            otherwise only drop its tables before creating them again.
        skip_demo_data: when true, do not load the demo data.
        skip_file_location: when true, do not create the default files
            location.
        drop_taxonomies: when true, delete all existing taxonomies first.
        skip_taxonomy_import: when true, skip the taxonomy import (and the
            demo data, which is only loaded together with the taxonomies).
        verbose: when true, echo the output of every invoked sub-command.
        taxonomies: directory containing the taxonomy ``xlsx`` files.
    """
    from flask import current_app
    from invenio_base.app import create_cli

    click.secho("oarepo setup started...", fg="blue")

    # Clean redis
    redis.StrictRedis.from_url(
        current_app.config["CACHE_REDIS_URL"]).flushall()
    click.secho("redis cache cleared...", fg="red")

    cli = create_cli()

    # Important: force API app on CLI context for proper URL generation
    cli.create_app = create_api
    runner = create_api().test_cli_runner()

    def run_command(command, catch_exceptions=False):
        # Invoke one CLI sub-command through the test runner.
        click.secho("oarepo {}...".format(command), fg="green")
        res = runner.invoke(cli, command, catch_exceptions=catch_exceptions)
        if verbose:
            click.secho(res.output)

    # Print all routes considered for URL generation
    run_command('routes')

    # Remove and create db and indexes
    if recreate_db:
        run_command("db destroy --yes-i-know", catch_exceptions=True)
        run_command("db init")
    else:
        run_command("db drop --yes-i-know")
    run_command("db create")
    run_command("index destroy --force --yes-i-know")
    run_command("index init --force")
    run_command("index queue init purge")

    # Create roles to restrict access
    run_command("roles create admin")

    # Create users
    run_command("users create [email protected] -a --password={}".format(
        admin_password))  # ID 1
    create_userprofile_for("*****@*****.**", "admin", "OArepo Administrator")

    # Assign roles
    run_command("roles add [email protected] admin")

    # Assign actions
    run_command("access allow superuser-access role admin")

    # Create files location
    if not skip_file_location:
        run_command("files location --default oarepo /tmp/oarepo")

    # Create ACLs index for preferred SCHEMA
    run_command("invenio invenio_explicit_acls prepare {}".format(
        ACL_PREFERRED_SCHEMA))

    # Drop taxonomy data
    if drop_taxonomies:
        taxo_list = runner.invoke(cli,
                                  'taxonomies list',
                                  catch_exceptions=False)
        click.secho("oarepo dropping existing taxonomies {}".format(
            taxo_list.output),
                    fg="yellow")
        # Lines starting with ' ' or '*' are skipped, as are empty lines,
        # which would otherwise fail on the first-character check.
        for tax in [
                t for t in taxo_list.output.splitlines()
                if t and t[0] not in [' ', '*']
        ]:
            click.secho("oarepo deleting taxonomy {}".format(tax), fg="yellow")
            run_command('taxonomies delete {}'.format(tax))

    # Import taxonomies
    if not skip_taxonomy_import:
        import os
        click.secho("oarepo importing taxonomies from {}".format(taxonomies),
                    fg="green")
        for tax_file in os.listdir(taxonomies):
            if tax_file.endswith('xlsx'):
                tax_path = os.path.join(taxonomies, tax_file)
                click.secho("oarepo importing taxonomy {}".format(tax_path),
                            fg="green")
                if tax_file.startswith('event'):
                    run_command(
                        'taxonomies import {} --str web --str organizer --str startDate --str endDate --bool '
                        'selectable --drop'.format(tax_path))
                elif tax_file.startswith('format'):
                    run_command(
                        'taxonomies import {} --str resolution --str spec --bool selectable --drop'
                        .format(tax_path))

        click.secho("oarepo setting all-read permission on taxonomies",
                    fg="green")
        run_command('taxonomies all-read')
        # TODO: what about taxonomy modify?

        # Demo data is loaded only together with the taxonomies; honour the
        # skip_demo_data flag, which was previously accepted but ignored.
        if not skip_demo_data:
            run_command('demo data')

    click.secho("oarepo setup finished successfully", fg="blue")
Ejemplo n.º 9
0
def nr_update_access_rights(ctx):
    """Deep-update access rights, then reindex everything.

    Both steps run inside the API application context; a reindexing error
    is raised to the caller.  ``ctx`` is not used.
    """
    with create_api().app_context():
        update_access_rights(deep=True)
        # No PID filter and no single-record restriction.
        _reindex(None, raise_on_error=True, only=None)
Ejemplo n.º 10
0
def nr_reindex(ctx, pids, raise_on_error=True, only=None):
    """Run ``_reindex`` inside the API application context.

    Args:
        ctx: invocation context; not used by this function.
        pids: forwarded unchanged to ``_reindex`` as its first argument.
        raise_on_error: forwarded unchanged to ``_reindex``.
        only: forwarded unchanged to ``_reindex``.
    """
    api = create_api()
    with api.app_context():
        _reindex(pids, raise_on_error=raise_on_error, only=only)