Example #1
File: tasks.py Project: xbee/zenodo
def update_record_statistics(start_date=None, end_date=None):
    """Update "_stats" field of affected records."""
    start_date = dateutil_parse(start_date) if start_date else None
    end_date = dateutil_parse(end_date) if end_date else None
    aggr_configs = {}

    if not start_date and not end_date:
        start_date = datetime.utcnow()
        end_date = datetime.utcnow()

        for aggr_name, aggr_cfg in current_stats.aggregations.items():
            aggr = aggr_cfg.cls(name=aggr_cfg.name, **aggr_cfg.params)
            if not Index(aggr.index, using=aggr.client).exists():
                if not Index(aggr.event_index, using=aggr.client).exists():
                    start_date = min(start_date, datetime.utcnow())
                else:
                    start_date = min(start_date,
                                     aggr._get_oldest_event_timestamp())

            # Retrieve the last two bookmarks
            bookmarks = aggr.list_bookmarks(limit=2)
            if len(bookmarks) >= 1:
                end_date = max(
                    end_date,
                    datetime.strptime(bookmarks[0].date, aggr.doc_id_suffix))
            if len(bookmarks) == 2:
                start_date = min(
                    start_date,
                    datetime.strptime(bookmarks[1].date, aggr.doc_id_suffix))

            aggr_configs[aggr.index] = aggr
    elif start_date and end_date:
        for aggr_name, aggr_cfg in current_stats.aggregations.items():
            aggr = aggr_cfg.cls(name=aggr_cfg.name, **aggr_cfg.params)
            aggr_configs[aggr.index] = aggr
    else:
        return

    # Get conceptrecids for all the affected records between the two dates
    conceptrecids = set()
    for aggr_index, aggr in aggr_configs.items():
        query = Search(
            using=aggr.client,
            index=aggr.index,
            doc_type=aggr.doc_type,
        ).filter('range',
                 timestamp={
                     'gte':
                     start_date.replace(microsecond=0).isoformat() + '||/d',
                     'lte':
                     end_date.replace(microsecond=0).isoformat() + '||/d'
                 }).source(include='conceptrecid')
        conceptrecids |= {b.conceptrecid for b in query.scan()}

    indexer = RecordIndexer()
    for conceptrecid_val in conceptrecids:
        conceptrecid = PersistentIdentifier.get('recid', conceptrecid_val)
        pv = PIDVersioning(parent=conceptrecid)
        children_recids = pv.children.all()
        indexer.bulk_index([str(p.object_uuid) for p in children_recids])
Example #2
def update_expired_embargoes():
    """Release expired embargoes every midnight."""
    logger = current_app.logger
    base_url = urlunsplit((
        current_app.config.get('PREFERRED_URL_SCHEME', 'http'),
        current_app.config['JSONSCHEMAS_HOST'],
        current_app.config.get('APPLICATION_ROOT') or '', '', ''
    ))
    # The task needs to run in a request context as JSON Schema validation
    # will use url_for.
    with current_app.test_request_context('/', base_url=base_url):
        s = B2ShareRecordsSearch(
            using=current_search_client,
            index='records'
        ).query(
            'query_string',
            query='open_access:false AND embargo_date:{{* TO {0}}}'.format(
                datetime.now(timezone.utc).isoformat()
            ),
            allow_leading_wildcard=False
        ).fields([])
        record_ids = [hit.meta.id for hit in s.scan()]
        if record_ids:
            logger.info('Changing access of {} embargoed publications'
                        ' to public.'.format(len(record_ids)))
        for record in Record.get_records(record_ids):
            logger.debug('Making embargoed publication {} public'.format(
                record.id))
            record['open_access'] = True
            record.commit()
        db.session.commit()

        indexer = RecordIndexer()
        indexer.bulk_index(record_ids)
        indexer.process_bulk_queue()
Example #3
def demo_init():
    """Initialize demo site."""
    from flask import current_app
    records = []
    # Import bibliographic records
    click.secho('Importing bibliographic records', fg='green')
    records += import_records(
        marc21,
        current_app.extensions['invenio-jsonschemas'].path_to_url(
            'marc21/bibliographic/bd-v1.0.2.json'),
        pkg_resources.resource_filename('invenio_records',
                                        'data/marc21/bibliographic.xml'),
    )
    # FIXME add support for authority records.
    # Import authority records
    # click.secho('Importing authority records', fg='green')
    # records += import_records(
    #     marc21_authority,
    #     current_app.extensions['invenio-jsonschemas'].path_to_url(
    #         'marc21/authority/ad-v1.0.2.json'),
    #     pkg_resources.resource_filename(
    #         'invenio_records', 'data/marc21/authority.xml'),
    # )
    db.session.commit()
    # Index all records
    click.secho('Indexing records', fg='green')
    indexer = RecordIndexer()
    indexer.bulk_index(records)
    indexer.process_bulk_queue()
Example #4
def create_fake_record(bulk_size, fake):
    """Create records for demo purposes."""
    records_bulk = []
    start = timeit.default_timer()
    for _ in range(bulk_size):
        # Create fake record metadata
        record_data = {
            "contributors": [{
                "name": fake.name()
            }],
            "description": fake.bs(),
            "title": fake.company() + "'s dataset",
        }

        # Create record in DB
        rec_uuid = uuid.uuid4()
        current_pidstore.minters["recid"](rec_uuid, record_data)
        Record.create(record_data, id_=rec_uuid)

        # Add record for bulk indexing
        records_bulk.append(rec_uuid)

    # Flush to index and database
    db.session.commit()
    click.secho(f"Writing {bulk_size} records to the database", fg="green")

    # Bulk index records
    ri = RecordIndexer()
    ri.bulk_index(records_bulk)
    current_search.flush_and_refresh(index="records")
    click.secho(f"Sending {bulk_size} records to be indexed", fg="green")
    stop = timeit.default_timer()
    click.secho(f"Creating {bulk_size} records took {stop - start}.",
                fg="green")
Example #5
def test_indexer_bulk_index(app, queue):
    """Test delay indexing."""
    with app.app_context():
        with establish_connection() as c:
            indexer = RecordIndexer()
            id1 = uuid.uuid4()
            id2 = uuid.uuid4()
            indexer.bulk_index([id1, id2])
            indexer.bulk_delete([id1, id2])

            consumer = Consumer(
                connection=c,
                queue=indexer.mq_queue.name,
                exchange=indexer.mq_exchange.name,
                routing_key=indexer.mq_routing_key)

            messages = list(consumer.iterqueue())
            [m.ack() for m in messages]

            assert len(messages) == 4
            data0 = messages[0].decode()
            assert data0['id'] == str(id1)
            assert data0['op'] == 'index'
            data2 = messages[2].decode()
            assert data2['id'] == str(id1)
            assert data2['op'] == 'delete'
Example #6
def update_expired_embargos():
    """Release expired embargoes every midnight."""
    logger = current_app.logger
    base_url = urlunsplit(
        (current_app.config.get('PREFERRED_URL_SCHEME', 'http'),
         current_app.config['JSONSCHEMAS_HOST'],
         current_app.config.get('APPLICATION_ROOT') or '', '', ''))
    # The task needs to run in a request context as JSON Schema validation
    # will use url_for.
    with current_app.test_request_context('/', base_url=base_url):
        s = B2ShareRecordsSearch(
            using=current_search_client, index='records').query(
                'query_string',
                query='open_access:false AND embargo_date:{{* TO {0}}}'.format(
                    datetime.now(timezone.utc).isoformat()),
                allow_leading_wildcard=False).fields([])
        record_ids = [hit.meta.id for hit in s.scan()]
        if record_ids:
            logger.info('Changing access of {} embargoed publications'
                        ' to public.'.format(len(record_ids)))
        for record in Record.get_records(record_ids):
            logger.debug('Making embargoed publication {} public'.format(
                record.id))
            record['open_access'] = True
            record.commit()
        db.session.commit()

        indexer = RecordIndexer()
        indexer.bulk_index(record_ids)
        indexer.process_bulk_queue()
Example #7
def bulk_index_records(records):
    """Bulk index a list of records."""
    indexer = RecordIndexer()

    click.echo("Bulk indexing {} records...".format(len(records)))
    indexer.bulk_index([str(r.id) for r in records])
    indexer.process_bulk_queue()
    click.echo("Indexing completed!")
Example #8
def update_expired_embargos():
    """Release expired embargoes every midnight."""
    record_ids = AccessRight.get_expired_embargos()
    for record in Record.get_records(record_ids):
        record['access_right'] = AccessRight.OPEN
        record.commit()
    db.session.commit()

    indexer = RecordIndexer()
    indexer.bulk_index(record_ids)
    indexer.process_bulk_queue()
Example #9
def update_expired_embargos():
    """Release expired embargoes every midnight."""
    record_ids = AccessRight.get_expired_embargos()
    for record in Record.get_records(record_ids):
        record['access_right'] = AccessRight.OPEN
        record.commit()
    db.session.commit()

    indexer = RecordIndexer()
    indexer.bulk_index(record_ids)
    indexer.process_bulk_queue()
Example #10
def create_records(records):
    """Async records creation and indexing."""
    record_indexer = RecordIndexer()
    record_uuids = []
    for record in records:
        uid = uuid.uuid4()
        id = bibid_minter(uid, record)
        record = Record.create(record, id_=uid)
        record_uuids.append(uid)
    record_indexer.bulk_index(record_uuids)
    record_indexer.process_bulk_queue()
    db.session.commit()
Example #11
def load_custom_records():
    """Initialize demo site."""
    from flask import current_app
    current_app.config['RECORDS_REST_DEFAULT_READ_PERMISSION_FACTORY'] = \
        allow_all
    # Import bibliographic records
    click.secho('Importing custom records', fg='green')
    records = import_records()
    db.session.commit()
    # Index all records
    click.secho('Indexing records', fg='green')
    indexer = RecordIndexer()
    indexer.bulk_index(records)
    indexer.process_bulk_queue()
Example #12
def marc21_import(dojson_model, input):
    """Import MARCXML records."""
    from flask import current_app
    if dojson_model == marc21:
        schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
            'marc21/bibliographic/bd-v1.0.0.json')
    elif dojson_model == marc21_authority:
        schema = current_app.extensions['invenio-jsonschemas'].path_to_url(
            'marc21/authority/ad-v1.0.0.json')

    # Create records
    click.secho('Importing records', fg='green')
    record_ids = import_records(dojson_model, schema, input)
    db.session.commit()

    # Index records
    click.secho('Indexing records', fg='green')
    indexer = RecordIndexer()
    indexer.bulk_index(record_ids)
    indexer.process_bulk_queue()
Example #13
def bulk_index(uuids, process=False, verbose=False):
    """Bulk index records."""
    if verbose:
        click.echo(' add to index: {count}'.format(count=len(uuids)))
    indexer = RecordIndexer()
    retry = True
    minutes = 1
    while retry:
        try:
            indexer.bulk_index(uuids)
            retry = False
        except Exception as exc:
            msg = 'Bulk Index Error: retry in {minutes} min {exc}'.format(
                exc=exc, minutes=minutes)
            current_app.logger.error(msg)
            if verbose:
                click.secho(msg, fg='red')
            sleep(minutes * 60)
            retry = True
            minutes *= 2
    if process:
        indexer.process_bulk_queue()
Example #14
def data(
    n_docs,
    n_items,
    n_eitems,
    n_loans,
    n_tags,
    n_intlocs,
    n_series,
    n_document_requests,
):
    """Insert demo data."""
    click.secho("Generating demo data", fg="yellow")

    indexer = RecordIndexer()

    holder = Holder(
        patrons_pids=["1", "2", "5", "6"],
        librarian_pid="4",
        total_intloc=n_intlocs,
        total_tags=n_tags,
        total_items=n_items,
        total_eitems=n_eitems,
        total_documents=n_docs,
        total_loans=n_loans,
        total_series=n_series,
        total_document_requests=n_document_requests,
    )

    click.echo("Creating locations...")
    loc_generator = LocationGenerator(holder, minter)
    loc_generator.generate()
    rec = loc_generator.persist()
    indexer.index(rec)

    # InternalLocations
    intlocs_generator = InternalLocationGenerator(holder, minter)
    intlocs_generator.generate()
    rec_intlocs = intlocs_generator.persist()

    # Tags
    click.echo("Creating tags...")
    tags_generator = TagGenerator(holder, minter)
    tags_generator.generate()
    rec_tags = tags_generator.persist()

    # Series
    click.echo("Creating series...")
    series_generator = SeriesGenerator(holder, minter)
    series_generator.generate()
    rec_series = series_generator.persist()

    # Documents
    click.echo("Creating documents...")
    documents_generator = DocumentGenerator(holder, minter)
    documents_generator.generate()
    rec_docs = documents_generator.persist()

    # Items
    click.echo("Creating items...")
    items_generator = ItemGenerator(holder, minter)
    items_generator.generate()
    rec_items = items_generator.persist()

    # EItems
    click.echo("Creating eitems...")
    eitems_generator = EItemGenerator(holder, minter)
    eitems_generator.generate()
    rec_eitems = eitems_generator.persist()

    # Loans
    click.echo("Creating loans...")
    loans_generator = LoanGenerator(holder, minter)
    loans_generator.generate()
    rec_loans = loans_generator.persist()

    # Related records
    click.echo("Creating related records...")
    related_generator = RecordRelationsGenerator(holder, minter)
    related_generator.generate(rec_docs, rec_series)
    related_generator.persist()

    # Document requests
    click.echo("Creating document requests...")
    document_requests_generator = DocumentRequestGenerator(holder, minter)
    document_requests_generator.generate()
    rec_requests = document_requests_generator.persist()

    # index locations
    indexer.bulk_index([str(r.id) for r in rec_intlocs])
    click.echo("Sent to the indexing queue {0} locations".format(
        len(rec_intlocs)))

    # index tags
    indexer.bulk_index([str(r.id) for r in rec_tags])
    click.echo("Sent to the indexing queue {0} tags".format(len(rec_tags)))
    # process queue so series can resolve tags correctly
    indexer.process_bulk_queue()

    # index series
    indexer.bulk_index([str(r.id) for r in rec_series])
    click.echo("Sent to the indexing queue {0} series".format(len(rec_series)))

    # index loans
    indexer.bulk_index([str(r.id) for r in rec_loans])
    click.echo("Sent to the indexing queue {0} loans".format(len(rec_loans)))

    click.secho("Now indexing...", fg="green")
    # process queue so items can resolve circulation status correctly
    indexer.process_bulk_queue()

    # index eitems
    indexer.bulk_index([str(r.id) for r in rec_eitems])
    click.echo("Sent to the indexing queue {0} eitems".format(len(rec_eitems)))

    # index items
    indexer.bulk_index([str(r.id) for r in rec_items])
    click.echo("Sent to the indexing queue {0} items".format(len(rec_items)))

    click.secho("Now indexing...", fg="green")
    # process queue so documents can resolve circulation correctly
    indexer.process_bulk_queue()

    # index document requests
    indexer.bulk_index([str(r.id) for r in rec_requests])
    click.echo("Sent to the indexing queue {0} document requests".format(
        len(rec_requests)))

    click.secho("Now indexing...", fg="green")
    indexer.process_bulk_queue()

    # flush all indices after indexing, otherwise ES won't be ready for tests
    current_search.flush_and_refresh(index="*")

    # index documents
    indexer.bulk_index([str(r.id) for r in rec_docs])
    click.echo("Sent to the indexing queue {0} documents".format(
        len(rec_docs)))

    # index loans again
    indexer.bulk_index([str(r.id) for r in rec_loans])
    click.echo("Sent to the indexing queue {0} loans".format(len(rec_loans)))

    click.secho("Now indexing...", fg="green")
    indexer.process_bulk_queue()
Example #15
def update_record_statistics(start_date=None, end_date=None):
    """Update "_stats" field of affected records."""
    start_date = dateutil_parse(start_date) if start_date else None
    end_date = dateutil_parse(end_date) if end_date else None
    aggr_configs = {}

    if not start_date and not end_date:
        start_date = datetime.utcnow()
        end_date = datetime.utcnow()

        for aggr_name in current_stats.enabled_aggregations:
            aggr_cfg = current_stats.aggregations[aggr_name]
            aggr = aggr_cfg.aggregator_class(name=aggr_cfg.name,
                                             **aggr_cfg.aggregator_config)

            if not Index(aggr.aggregation_alias, using=aggr.client).exists():
                if not Index(aggr.event_index, using=aggr.client).exists():
                    start_date = min(start_date, datetime.utcnow())
                else:
                    start_date = min(start_date,
                                     aggr._get_oldest_event_timestamp())

            # Retrieve the last two bookmarks
            bookmarks = Search(using=aggr.client,
                               index=aggr.aggregation_alias,
                               doc_type=aggr.bookmark_doc_type)[0:2].sort({
                                   'date': {
                                       'order': 'desc'
                                   }
                               }).execute()

            if len(bookmarks) >= 1:
                end_date = max(
                    end_date,
                    datetime.strptime(bookmarks[0].date, aggr.doc_id_suffix))
            if len(bookmarks) == 2:
                start_date = min(
                    start_date,
                    datetime.strptime(bookmarks[1].date, aggr.doc_id_suffix))

            aggr_configs[aggr.aggregation_alias] = aggr
    elif start_date and end_date:
        for aggr_name in current_stats.enabled_aggregations:
            aggr_cfg = current_stats.aggregations[aggr_name]
            aggr = aggr_cfg.aggregator_class(name=aggr_cfg.name,
                                             **aggr_cfg.aggregator_config)
            aggr_configs[aggr.aggregation_alias] = aggr
    else:
        return

    # Get conceptrecids for all the affected records between the two dates
    conceptrecids = set()
    for aggr_alias, aggr in aggr_configs.items():
        query = Search(
            using=aggr.client,
            index=aggr.aggregation_alias,
            doc_type=aggr.aggregation_doc_type,
        ).filter('range',
                 timestamp={
                     'gte':
                     start_date.replace(microsecond=0).isoformat() + '||/d',
                     'lte':
                     end_date.replace(microsecond=0).isoformat() + '||/d'
                 }).extra(_source=False)
        query.aggs.bucket('ids', 'terms', field='conceptrecid', size=0)
        conceptrecids |= {
            b.key
            for b in query.execute().aggregations.ids.buckets
        }

    indexer = RecordIndexer()
    for conceptrecid_val in conceptrecids:
        conceptrecid = PersistentIdentifier.get('recid', conceptrecid_val)
        pv = PIDVersioning(parent=conceptrecid)
        children_recids = pv.children.all()
        indexer.bulk_index([str(p.object_uuid) for p in children_recids])
Example #16
def data(n_docs, n_items, n_loans):
    """Insert demo data."""
    indexer = RecordIndexer()

    rec_location = create_loc_record()
    db.session.commit()
    indexer.index(rec_location)

    rec_int_locs = []
    with click.progressbar(get_internal_locations(rec_location),
                           label="Internal Locations") as ilocs:
        for iloc in ilocs:
            rec = create_iloc_record(iloc, rec_location[Location.pid_field])
            rec_int_locs.append(rec)

    documents, items = get_documents_items(rec_int_locs,
                                           n_docs=n_docs,
                                           n_items=n_items)
    rec_docs = []
    with click.progressbar(documents, label="Documents") as docs:
        for doc in docs:
            rec = create_doc_record(doc)
            rec_docs.append(rec)

    rec_items = []
    with click.progressbar(items, label="Items") as _items:
        for item in _items:
            iloc = rec_int_locs[randint(0, len(rec_int_locs) - 1)]
            rec = create_item_record(item, iloc[InternalLocation.pid_field])
            rec_items.append(rec)

    db.session.commit()

    loans = get_loans_for_items(
        rec_items,
        rec_location,
        patron_ids=["1", "2"],
        librarian_id="4",
        n_loans=n_loans,
    )
    rec_loans = []
    with click.progressbar(loans, label="Loans") as _loans:
        for _loan in _loans:
            rec = create_loan_record(_loan)
            rec_loans.append(rec)

    db.session.commit()

    # index locations
    indexer.bulk_index([str(r.id) for r in rec_int_locs])
    click.echo('Sent to the indexing queue {0} locations'.format(
        len(rec_int_locs)))

    # index loans
    indexer.bulk_index([str(r.id) for r in rec_loans])
    click.echo('Sent to the indexing queue {0} loans'.format(len(rec_loans)))

    click.secho('Now indexing...', fg='green')
    # process queue so items can resolve circulation status correctly
    indexer.process_bulk_queue()

    # index items
    indexer.bulk_index([str(r.id) for r in rec_items])
    click.echo('Sent to the indexing queue {0} items'.format(len(rec_items)))

    click.secho('Now indexing...', fg='green')
    # process queue so documents can resolve circulation correctly
    indexer.process_bulk_queue()

    # sleep to give time for items to be indexed
    time.sleep(1)

    # index documents
    indexer.bulk_index([str(r.id) for r in rec_docs])
    click.echo('Sent to the indexing queue {0} documents'.format(
        len(rec_docs)))

    click.secho('Now indexing...', fg='green')
    indexer.process_bulk_queue()
Example #17
def data(n_docs, n_items, n_eitems, n_loans, n_keywords, n_intlocs, n_series):
    """Insert demo data."""
    click.secho('Generating demo data', fg='yellow')

    indexer = RecordIndexer()

    holder = Holder(
        patrons_pids=["1", "2", "5", "6"],
        librarian_pid="4",
        total_intloc=n_intlocs,
        total_keywords=n_keywords,
        total_items=n_items,
        total_eitems=n_eitems,
        total_documents=n_docs,
        total_loans=n_loans,
        total_series=n_series,
    )

    click.echo('Creating locations...')
    loc_generator = LocationGenerator(holder, minter)
    loc_generator.generate()
    rec = loc_generator.persist()
    indexer.index(rec)

    # InternalLocations
    intlocs_generator = InternalLocationGenerator(holder, minter)
    intlocs_generator.generate()
    rec_intlocs = intlocs_generator.persist()

    # Keywords
    click.echo('Creating keywords...')
    keywords_generator = KeywordGenerator(holder, minter)
    keywords_generator.generate()
    rec_keywords = keywords_generator.persist()

    # Series
    click.echo('Creating series...')
    series_generator = SeriesGenerator(holder, minter)
    series_generator.generate()
    rec_series = series_generator.persist()

    # Documents
    click.echo('Creating documents...')
    documents_generator = DocumentGenerator(holder, minter)
    documents_generator.generate()
    rec_docs = documents_generator.persist()

    # Items
    click.echo('Creating items...')
    items_generator = ItemGenerator(holder, minter)
    items_generator.generate()
    rec_items = items_generator.persist()

    # EItems
    click.echo('Creating eitems...')
    eitems_generator = EItemGenerator(holder, minter)
    eitems_generator.generate()
    rec_eitems = eitems_generator.persist()

    # Loans
    click.echo('Creating loans...')
    loans_generator = LoanGenerator(holder, minter)
    loans_generator.generate()
    rec_loans = loans_generator.persist()

    # Related records
    click.echo('Creating related records...')
    related_generator = RelatedRecordsGenerator(holder, minter)
    related_generator.generate(rec_docs, rec_series)
    related_generator.persist()

    # index locations
    indexer.bulk_index([str(r.id) for r in rec_intlocs])
    click.echo('Sent to the indexing queue {0} locations'.format(
        len(rec_intlocs)))

    # index keywords
    indexer.bulk_index([str(r.id) for r in rec_keywords])
    click.echo('Sent to the indexing queue {0} keywords'.format(
        len(rec_keywords)))
    # process queue so series can resolve keywords correctly
    indexer.process_bulk_queue()

    # index series
    indexer.bulk_index([str(r.id) for r in rec_series])
    click.echo('Sent to the indexing queue {0} series'.format(len(rec_series)))

    # index loans
    indexer.bulk_index([str(r.id) for r in rec_loans])
    click.echo('Sent to the indexing queue {0} loans'.format(len(rec_loans)))

    click.secho('Now indexing...', fg='green')
    # process queue so items can resolve circulation status correctly
    indexer.process_bulk_queue()

    # index eitems
    indexer.bulk_index([str(r.id) for r in rec_eitems])
    click.echo('Sent to the indexing queue {0} eitems'.format(len(rec_eitems)))

    # index items
    indexer.bulk_index([str(r.id) for r in rec_items])
    click.echo('Sent to the indexing queue {0} items'.format(len(rec_items)))

    click.secho('Now indexing...', fg='green')
    # process queue so documents can resolve circulation correctly
    indexer.process_bulk_queue()

    # flush all indices after indexing, otherwise ES won't be ready for tests
    current_search.flush_and_refresh(index='*')

    # index documents
    indexer.bulk_index([str(r.id) for r in rec_docs])
    click.echo('Sent to the indexing queue {0} documents'.format(
        len(rec_docs)))

    click.secho('Now indexing...', fg='green')
    indexer.process_bulk_queue()
Example #18
def data(n_docs, n_items, n_eitems, n_loans, n_intlocs, n_series,
         n_document_requests, n_vendors, n_orders, n_libraries,
         n_borrowing_requests):
    """Insert demo data."""
    click.secho("Generating demo data", fg="yellow")

    indexer = RecordIndexer()

    vocabulary_dir = os.path.join(os.path.realpath("."), "invenio_app_ils",
                                  "vocabularies", "data")

    with open(os.path.join(vocabulary_dir, "tags.json")) as f:
        tags = json.loads(f.read())

    with open(os.path.join(vocabulary_dir, "languages.json")) as f:
        languages = json.loads(f.read())

    holder = Holder(
        patrons_pids=["1", "2", "5", "6"],
        languages=languages,
        librarian_pid="4",
        tags=tags,
        total_intloc=n_intlocs,
        total_items=n_items,
        total_eitems=n_eitems,
        total_documents=n_docs,
        total_loans=n_loans,
        total_series=n_series,
        total_document_requests=n_document_requests,
        total_vendors=n_vendors,
        total_orders=n_orders,
        total_borrowing_requests=n_borrowing_requests,
        total_libraries=n_libraries,
    )

    click.echo("Creating locations...")
    loc_generator = LocationGenerator(holder, minter)
    loc_generator.generate()
    rec = loc_generator.persist()
    indexer.index(rec)

    # InternalLocations
    intlocs_generator = InternalLocationGenerator(holder, minter)
    intlocs_generator.generate()
    rec_intlocs = intlocs_generator.persist()

    # Series
    click.echo("Creating series...")
    series_generator = SeriesGenerator(holder, minter)
    series_generator.generate()
    rec_series = series_generator.persist()

    # Documents
    click.echo("Creating documents...")
    documents_generator = DocumentGenerator(holder, minter)
    documents_generator.generate()
    rec_docs = documents_generator.persist()

    # Items
    click.echo("Creating items...")
    items_generator = ItemGenerator(holder, minter)
    items_generator.generate()
    rec_items = items_generator.persist()

    # EItems
    click.echo("Creating eitems...")
    eitems_generator = EItemGenerator(holder, minter)
    eitems_generator.generate()
    rec_eitems = eitems_generator.persist()

    # Loans
    click.echo("Creating loans...")
    loans_generator = LoanGenerator(holder, minter)
    loans_generator.generate()
    rec_loans = loans_generator.persist()

    # Related records
    click.echo("Creating related records...")
    related_generator = RecordRelationsGenerator(holder, minter)
    related_generator.generate(rec_docs, rec_series)
    related_generator.persist()

    # Document requests
    click.echo("Creating document requests...")
    document_requests_generator = DocumentRequestGenerator(holder, minter)
    document_requests_generator.generate()
    rec_requests = document_requests_generator.persist()

    # Vendors
    click.echo("Creating vendors...")
    vendor_generator = VendorGenerator(holder, minter)
    vendor_generator.generate()
    rec_vendors = vendor_generator.persist()

    # Orders
    click.echo("Creating orders...")
    order_generator = OrderGenerator(holder, minter)
    order_generator.generate()
    rec_orders = order_generator.persist()

    # Libraries
    click.echo("Creating libraries...")
    library_generator = LibraryGenerator(holder, minter)
    library_generator.generate()
    rec_libraries = library_generator.persist()

    # Borrowing requests
    click.echo("Creating borrowing requests...")
    borrowing_requests_generator = BorrowingRequestGenerator(holder, minter)
    borrowing_requests_generator.generate()
    rec_borrowing_requests = borrowing_requests_generator.persist()

    # index locations
    indexer.bulk_index([str(r.id) for r in rec_intlocs])
    click.echo("Sent to the indexing queue {0} locations".format(
        len(rec_intlocs)))

    # index series
    indexer.bulk_index([str(r.id) for r in rec_series])
    click.echo("Sent to the indexing queue {0} series".format(len(rec_series)))

    # index loans
    indexer.bulk_index([str(r.id) for r in rec_loans])
    click.echo("Sent to the indexing queue {0} loans".format(len(rec_loans)))

    click.secho("Now indexing...", fg="green")
    # process queue so items can resolve circulation status correctly
    indexer.process_bulk_queue()

    # index eitems
    indexer.bulk_index([str(r.id) for r in rec_eitems])
    click.echo("Sent to the indexing queue {0} eitems".format(len(rec_eitems)))

    # index items
    indexer.bulk_index([str(r.id) for r in rec_items])
    click.echo("Sent to the indexing queue {0} items".format(len(rec_items)))

    click.secho("Now indexing...", fg="green")
    # process queue so documents can resolve circulation correctly
    indexer.process_bulk_queue()

    # index libraries
    indexer.bulk_index([str(r.id) for r in rec_libraries])
    click.echo("Sent to the indexing queue {0} libraries".format(
        len(rec_libraries)))

    # index borrowing requests
    indexer.bulk_index([str(r.id) for r in rec_borrowing_requests])
    click.echo("Sent to the indexing queue {0} borrowing requests".format(
        len(rec_borrowing_requests)))

    click.secho("Now indexing...", fg="green")
    indexer.process_bulk_queue()

    # flush all indices after indexing, otherwise ES won't be ready for tests
    current_search.flush_and_refresh(index="*")

    # index documents
    indexer.bulk_index([str(r.id) for r in rec_docs])
    click.echo("Sent to the indexing queue {0} documents".format(
        len(rec_docs)))

    # index document requests
    indexer.bulk_index([str(r.id) for r in rec_requests])
    click.echo("Sent to the indexing queue {0} document requests".format(
        len(rec_requests)))

    # index loans again
    indexer.bulk_index([str(r.id) for r in rec_loans])
    click.echo("Sent to the indexing queue {0} loans".format(len(rec_loans)))

    # index items again
    indexer.bulk_index([str(r.id) for r in rec_items])
    click.echo("Sent to the indexing queue {0} items".format(len(rec_items)))

    # index vendors
    indexer.bulk_index([str(r.id) for r in rec_vendors])
    click.echo("Sent to the indexing queue {0} vendors".format(
        len(rec_vendors)))

    # index orders
    indexer.bulk_index([str(r.id) for r in rec_orders])
    click.echo("Sent to the indexing queue {0} orders".format(len(rec_orders)))

    click.secho("Now indexing...", fg="green")
    indexer.process_bulk_queue()
Example #19
def import_documents(institution, pages):
    """Import documents from RERO doc.

    institution: String institution filter for retrieving documents
    pages: Number of pages to import
    """
    url = current_app.config.get('SONAR_DOCUMENTS_RERO_DOC_URL')

    click.secho('Importing {pages} pages of records for "{institution}" '
                'from {url}'.format(pages=pages,
                                    institution=institution,
                                    url=url))

    # Get institution record from database
    institution_record = InstitutionRecord.get_record_by_pid(institution)

    if not institution_record:
        raise ClickException('Institution record not found in database')

    institution_ref_link = InstitutionRecord.get_ref_link(
        'institutions', institution_record['pid'])

    # mapping between institution key and RERO doc filter
    institution_map = current_app.config.get(
        'SONAR_DOCUMENTS_INSTITUTIONS_MAP')

    if not institution_map:
        raise ClickException('Institution map not found in configuration')

    if institution not in institution_map:
        raise ClickException(
            'Institution map for "{institution}" not found in configuration, '
            'keys available {keys}'.format(institution=institution,
                                           keys=institution_map.keys()))

    key = institution_map[institution]
    current_page = 1

    indexer = RecordIndexer()

    while current_page <= pages:
        click.echo('Importing records {start} to {end}... '.format(
            start=(current_page * 10 - 9), end=(current_page * 10)),
                   nl=False)

        # Read Marc21 data for current page
        response = requests.get(
            '{url}?of=xm&jrec={first_record}&c=NAVSITE.{institution}'.format(
                url=url,
                first_record=(current_page * 10 - 9),
                institution=key.upper()),
            stream=True)

        if response.status_code != 200:
            raise ClickException('Request to "{url}" failed'.format(url=url))

        response.raw.decode_content = True

        ids = []

        for data in split_stream(response.raw):
            # Convert from Marc XML to JSON
            record = create_record(data)

            # Transform JSON
            record = marc21tojson.do(record)

            # Add institution
            record['institution'] = {'$ref': institution_ref_link}

            # Register record to DB
            db_record = DocumentRecord.create(record)
            db.session.commit()

            # Add ID for bulk index in elasticsearch
            ids.append(str(db_record.id))

        # index and process queue in elasticsearch
        indexer.bulk_index(ids)
        indexer.process_bulk_queue()

        current_page += 1

        click.secho('Done', fg='green', nl=True)

    click.secho('Finished', fg='green')
Example #20
def import_records(records_to_import):
    """Import records in database and index them.

    Used as a Celery task. The "ignore_result" flag means we do not keep
    the task status or result, which makes execution faster.

    :param list records_to_import: List of records to import.
    :returns: List of IDs.
    """
    indexer = RecordIndexer()

    ids = []

    for data in records_to_import:
        try:
            files_data = data.pop('files', [])

            record = DocumentRecord.get_record_by_identifier(
                data.get('identifiedBy', []))

            if not record:
                record = DocumentRecord.create(data,
                                               dbcommit=False,
                                               with_bucket=True)
            else:
                record.update(data)

            for file_data in files_data:
                # Store url and key and remove it from dict to pass dict to
                # kwargs in add_file_from_url method
                url = file_data.pop('url')
                key = file_data.pop('key')

                try:
                    record.add_file_from_url(url, key, **file_data)
                except Exception as exception:
                    current_app.logger.warning(
                        'Error during import of file {file} of record '
                        '{record}: {error}'.format(
                            file=key,
                            error=exception,
                            record=record['identifiedBy']))

            # Merge record in database, at this time it's not saved into DB.
            record.commit()

            # Pushing record to database, not yet persisted into DB
            db.session.flush()

            # Add ID for bulk index in elasticsearch
            ids.append(str(record.id))

            current_app.logger.info(
                'Record with reference "{reference}" imported successfully'.
                format(reference=record['identifiedBy']))

        except Exception as exception:
            current_app.logger.error(
                'Error during importation of record {record}: {exception}'.
                format(record=data, exception=exception))

    # Commit and index records
    db.session.commit()
    indexer.bulk_index(ids)
    indexer.process_bulk_queue()

    return ids
Example #21
def update_record_statistics(start_date=None, end_date=None):
    """Update "_stats" field of affected records."""
    start_date = dateutil_parse(start_date) if start_date else None
    end_date = dateutil_parse(end_date) if end_date else None
    aggr_configs = {}

    if not start_date and not end_date:
        start_date = datetime.utcnow()
        end_date = datetime.utcnow()

        for aggr_name in current_stats.enabled_aggregations:
            aggr_cfg = current_stats.aggregations[aggr_name]
            aggr = aggr_cfg.aggregator_class(
                name=aggr_cfg.name, **aggr_cfg.aggregator_config)

            if not Index(aggr.aggregation_alias, using=aggr.client).exists():
                if not Index(aggr.event_index, using=aggr.client).exists():
                    start_date = min(start_date, datetime.utcnow())
                else:
                    start_date = min(
                        start_date, aggr._get_oldest_event_timestamp())

            # Retrieve the last two bookmarks
            bookmarks = Search(
                using=aggr.client,
                index=aggr.aggregation_alias,
                doc_type=aggr.bookmark_doc_type
            )[0:2].sort({'date': {'order': 'desc'}}).execute()

            if len(bookmarks) >= 1:
                end_date = max(
                    end_date,
                    datetime.strptime(bookmarks[0].date, aggr.doc_id_suffix))
            if len(bookmarks) == 2:
                start_date = min(
                    start_date,
                    datetime.strptime(bookmarks[1].date, aggr.doc_id_suffix))

            aggr_configs[aggr.aggregation_alias] = aggr
    elif start_date and end_date:
        for aggr_name in current_stats.enabled_aggregations:
            aggr_cfg = current_stats.aggregations[aggr_name]
            aggr = aggr_cfg.aggregator_class(
                name=aggr_cfg.name, **aggr_cfg.aggregator_config)
            aggr_configs[aggr.aggregation_alias] = aggr
    else:
        return

    # Get conceptrecids for all the affected records between the two dates
    conceptrecids = set()
    for aggr_alias, aggr in aggr_configs.items():
        query = Search(
            using=aggr.client,
            index=aggr.aggregation_alias,
            doc_type=aggr.aggregation_doc_type,
        ).filter(
            'range', timestamp={
                'gte': start_date.replace(microsecond=0).isoformat() + '||/d',
                'lte': end_date.replace(microsecond=0).isoformat() + '||/d'}
        ).extra(_source=False)
        query.aggs.bucket('ids', 'terms', field='conceptrecid', size=0)
        conceptrecids |= {
            b.key for b in query.execute().aggregations.ids.buckets}

    indexer = RecordIndexer()
    for conceptrecid_val in conceptrecids:
        conceptrecid = PersistentIdentifier.get('recid', conceptrecid_val)
        pv = PIDVersioning(parent=conceptrecid)
        children_recids = pv.children.all()
        indexer.bulk_index([str(p.object_uuid) for p in children_recids])
Example #22
def _index(iterator):
    """Bulk index the iterator."""
    indexer = RecordIndexer()
    indexer.bulk_index(iterator)
    indexer.process_bulk_queue()
Example #23
def bulk_records(records):
    """Records creation."""
    n_updated = 0
    n_rejected = 0
    n_created = 0
    record_schema = current_jsonschemas.path_to_url('documents/document-v0.0.1.json')
    item_schema = current_jsonschemas.path_to_url('items/item-v0.0.1.json')
    holding_schema = current_jsonschemas.path_to_url('holdings/holding-v0.0.1.json')
    host_url = current_app.config.get('RERO_ILS_APP_BASE_URL')
    url_api = '{host}/api/{doc_type}/{pid}'
    record_id_iterator = []
    item_id_iterator = []
    holding_id_iterator = []
    indexer = RecordIndexer()
    start_time = datetime.now()
    for record in records:
        try:
            if record.get('frbr', False):
                document = record.get('document', {})
                """
                # check if already in Rero-ILS
                pid = None

                for identifier in document.get('identifiedBy') :
                    if identifier.get('source') == 'VIRTUA' :
                        bibid = identifier.get('value')
                        query = DocumentsSearch().filter(
                            'term',
                            identifiedBy__value=bibid
                        ).source(includes=['pid'])
                        try:
                            pid = [r.pid for r in query.scan()].pop()
                        except IndexError:
                            pid = None
                if pid:
                    # update the record
                    # Do nothing for the moment
                    continue
                else:
                    """
                document['$schema'] = record_schema

                created_time = datetime.now()
                document = Document.create(
                    document,
                    dbcommit=False,
                    reindex=False
                )
                
                record_id_iterator.append(document.id)

                uri_documents = url_api.format(host=host_url,
                                               doc_type='documents',
                                               pid=document.pid)
                
                map_holdings = {}
                for holding in record.get('holdings'):
                    holding['$schema'] = holding_schema
                    holding['document'] = {
                        '$ref': uri_documents
                        }
                    holding['circulation_category'] = {
                        '$ref': map_item_type(str(holding.get('circulation_category')))
                        }
                    holding['location'] = {
                        '$ref': map_locations(str(holding.get('location')))
                        }
                    
                    created_time = datetime.now()

                    result = Holding.create(
                        holding,
                        dbcommit=False,
                        reindex=False
                    )
                    
                    map_holdings.update({
                            '{location}#{cica}'.format(
                                location = holding.get('location'),
                                cica = holding.get('circulation_category')) : result.get('pid')
                        }
                    )
                    
                    holding_id_iterator.append(result.id)
                
                for item in record.get('items'):
                    item['$schema'] = item_schema
                    item['document'] = {
                        '$ref': uri_documents
                        }
                    item['item_type'] = {
                        '$ref': map_item_type(str(item.get('item_type')))
                        }
                    item['location'] = {
                        '$ref': map_locations(str(item.get('location')))
                        }

                    holding_pid = map_holdings.get(
                        '{location}#{cica}'.format(
                            location = item.get('location'),
                            cica = item.get('item_type')))

                    item['holding'] = {
                        '$ref': url_api.format(host=host_url,
                                    doc_type='holdings',
                                    pid=holding_pid)
                        }
                    
                    result = Item.create(
                        item,
                        dbcommit=False,
                        reindex=False
                    )

                    item_id_iterator.append(result.id)

                n_created += 1
            if n_created % 1000 == 0:
                execution_time = datetime.now() - start_time
                click.secho('{nb} created records in {execution_time}.'
                            .format(nb=len(record_id_iterator),
                                    execution_time=execution_time),
                            fg='white')
                start_time = datetime.now()

                db.session.commit()
                execution_time = datetime.now() - start_time
                click.secho('{nb} committed records in {execution_time}.'
                            .format(nb=len(record_id_iterator),
                                    execution_time=execution_time),
                            fg='white')
                start_time = datetime.now()
                click.secho('sending {n} holdings to indexer queue.'
                            .format(n=len(holding_id_iterator)), fg='white')
                indexer.bulk_index(holding_id_iterator)
                click.secho('process queue...', fg='yellow')
                indexer.process_bulk_queue()
                click.secho('sending {n} items to indexer queue.'
                            .format(n=len(item_id_iterator)), fg='white')
                indexer.bulk_index(item_id_iterator)
                click.secho('process queue...', fg='yellow')
                indexer.process_bulk_queue()
                click.secho('sending {n} documents to indexer queue.'
                            .format(n=len(record_id_iterator)), fg='white')
                indexer.bulk_index(record_id_iterator)
                click.secho('process queue...', fg='yellow')
                indexer.process_bulk_queue()
                execution_time = datetime.now() - start_time
                click.secho('indexing records process in {execution_time}.'
                            .format(execution_time=execution_time),
                            fg='white')
                click.secho('processing next batch records.', fg='green')

                record_id_iterator.clear()
                holding_id_iterator.clear()
                item_id_iterator.clear()
                start_time = datetime.now()
        except Exception as e:
            n_rejected += 1
            click.secho('Error processing record [{id}] : {e}'
                        .format(id=record.get('_id'), e=e), fg='red')
    db.session.commit()
    indexer.bulk_index(holding_id_iterator)
    indexer.process_bulk_queue()
    indexer.bulk_index(item_id_iterator)
    indexer.process_bulk_queue()
    indexer.bulk_index(record_id_iterator)
    indexer.process_bulk_queue()
    return n_created
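
Taken together, the examples above follow the same basic pattern: create or update records, commit the database session, queue the record UUIDs with RecordIndexer.bulk_index(), and then drain the queue with process_bulk_queue() so the documents are sent to the search engine in bulk. Below is a minimal sketch of that pattern, assuming a running Invenio application context with invenio-records, invenio-pidstore and invenio-indexer configured; the helper name create_and_index is illustrative and not part of any of the projects above.

import uuid

from invenio_db import db
from invenio_indexer.api import RecordIndexer
from invenio_pidstore import current_pidstore
from invenio_records.api import Record


def create_and_index(records_data):
    """Create records and send them to the bulk indexing queue."""
    record_uuids = []
    for data in records_data:
        # Mint a "recid" persistent identifier and create the record in the DB.
        rec_uuid = uuid.uuid4()
        current_pidstore.minters["recid"](rec_uuid, data)
        Record.create(data, id_=rec_uuid)
        record_uuids.append(str(rec_uuid))

    # Persist all records before queuing them for indexing.
    db.session.commit()

    # Queue the UUIDs, then process the queue to index the records in bulk.
    indexer = RecordIndexer()
    indexer.bulk_index(record_uuids)
    indexer.process_bulk_queue()
    return record_uuids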