Example #1
def es(appctx):
    """Setup and teardown all registered Elasticsearch indices.

    Scope: module
    This fixture will create all registered indexes in Elasticsearch and remove
    once done. Fixtures that perform changes (e.g. index or remove documents),
    should used the function-scoped :py:data:`es_clear` fixture to leave the
    indexes clean for the following tests.
    """
    from invenio_search import current_search, current_search_client
    from invenio_search.errors import IndexAlreadyExistsError

    try:
        list(current_search.put_templates())
    except IndexAlreadyExistsError:
        current_search_client.indices.delete_template('*')
        list(current_search.put_templates())

    try:
        list(current_search.create())
    except IndexAlreadyExistsError:
        list(current_search.delete(ignore=[404]))
        list(current_search.create())
    current_search_client.indices.refresh()

    try:
        yield current_search_client
    finally:
        current_search_client.indices.delete(index='*')
        current_search_client.indices.delete_template('*')
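
The :py:data:`es_clear` fixture referenced in the docstring is not shown on this page. A minimal sketch of what such a function-scoped companion could look like, assuming the same invenio_search helpers and the module-scoped es fixture above (the real fixture may differ):

import pytest

from invenio_search import current_search, current_search_client


@pytest.fixture()
def es_clear(es):
    """Leave the indices clean after a test that writes documents."""
    yield es
    # Sketch only: drop all indices and recreate the registered ones so
    # the following tests start from a clean state.
    current_search_client.indices.delete(index='*')
    list(current_search.create())
    current_search_client.indices.refresh()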
Example #2
def aggregated_events(app, es, mock_user_ctx, request):
    """Parametrized pre indexed sample events."""
    for t in current_search.put_templates(ignore=[400]):
        pass
    generate_events(app=app, **request.param)
    aggregate_events(['file-download-agg'])
    current_search.flush_and_refresh(index='*')
    yield
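
Because this fixture reads request.param, tests opt in through pytest's indirect parametrization. A hypothetical usage (the test name and parameter values are illustrative):

import pytest

from invenio_search import current_search_client


@pytest.mark.parametrize('aggregated_events',
                         [dict(file_number=2, event_number=10)],
                         indirect=['aggregated_events'])
def test_my_aggregation(app, es, aggregated_events):
    """Run against the pre-indexed, pre-aggregated sample events."""
    res = current_search_client.search(index='stats-file-download')
    assert res['hits']['total']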
Example #3
def generate_events(app, file_number=5, event_number=100, robot_event_number=0,
                    start_date=datetime.date(2017, 1, 1),
                    end_date=datetime.date(2017, 1, 7)):
    """Queued events for processing tests."""
    current_queues.declare()

    for t in current_search.put_templates(ignore=[400]):
        pass

    def _unique_ts_gen():
        ts = 0
        while True:
            ts += 1
            yield ts

    def generator_list():
        unique_ts = _unique_ts_gen()
        for file_idx in range(file_number):
            for entry_date in date_range(start_date, end_date):
                file_id = 'F000000000000000000000000000000{}'.format(
                    file_idx + 1)
                bucket_id = 'B000000000000000000000000000000{}'.format(
                    file_idx + 1)

                def build_event(is_robot=False):
                    ts = next(unique_ts)
                    return dict(
                        timestamp=datetime.datetime.combine(
                            entry_date,
                            datetime.time(minute=ts % 60,
                                          second=ts % 60)).isoformat(),
                        bucket_id=bucket_id,
                        file_id=file_id,
                        file_key='test.pdf',
                        size=9000,
                        visitor_id=100,
                        is_robot=is_robot
                    )

                for event_idx in range(event_number):
                    yield build_event()
                for event_idx in range(robot_event_number):
                    yield build_event(True)

    mock_queue = Mock()
    mock_queue.consume.return_value = generator_list()
    mock_queue.routing_key = 'stats-file-download'

    EventsIndexer(
        mock_queue,
        preprocessors=[
            build_file_unique_id
        ],
        double_click_window=0
    ).run()
    current_search_client.indices.refresh(index='*')
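
generate_events relies on a date_range helper that is not reproduced in these snippets. One plausible implementation, assuming an inclusive range of days (the bookmark assertion in Example #5 below is consistent with end_date being included):

import datetime


def date_range(start_date, end_date):
    """Yield every date from start_date to end_date, inclusive."""
    for offset in range((end_date - start_date).days + 1):
        yield start_date + datetime.timedelta(days=offset)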
Example #4
def elasticsearch_index_init(alembic, verbose):
    """Initialize the elasticsearch indices and indexing queue."""
    for _ in current_search.create(ignore=[400]):
        pass
    for _ in current_search.put_templates(ignore=[400]):
        pass
    queue = current_app.config['INDEXER_MQ_QUEUE']
    with establish_connection() as c:
        q = queue(c)
        q.declare()
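
establish_connection is likewise imported from elsewhere. A plausible shape for it, assuming a kombu broker connection configured through the application's BROKER_URL (the names here are assumptions, not the verified original):

from contextlib import contextmanager

from flask import current_app
from kombu import Connection


@contextmanager
def establish_connection():
    """Yield a broker connection for declaring the indexing queue."""
    # Assumption: the broker URL lives in the standard BROKER_URL config key.
    with Connection(current_app.config['BROKER_URL']) as conn:
        yield conn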
Example #5
def test_get_bookmark(app, indexed_events):
    """Test bookmark reading."""
    for t in current_search.put_templates(ignore=[400]):
        pass
    stat_agg = StatAggregator(name='file-download-agg',
                              client=current_search_client,
                              event='file-download',
                              aggregation_field='file_id',
                              aggregation_interval='day')
    stat_agg.run()
    assert stat_agg.get_bookmark() == datetime.datetime(2017, 1, 8)
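
The expected bookmark, 2017-01-08, is one day past the last generated event date (generate_events defaults to end_date 2017-01-07), i.e. the point from which the next aggregation run would resume. As a hypothetical follow-up, the aggregated documents themselves can be inspected with a plain search:

res = current_search_client.search(index='stats-file-download')
assert res['hits']['total'] > 0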
Example #6
def es(app):
    """Provide elasticsearch access."""
    list(current_search.delete(ignore=[400, 404]))
    current_search_client.indices.delete(index='*')
    current_search_client.indices.delete_template('*')
    list(current_search.create())
    list(current_search.put_templates())
    current_search_client.indices.refresh()
    try:
        yield current_search_client
    finally:
        current_search_client.indices.delete(index='*')
        current_search_client.indices.delete_template('*')
Example #7
def es(app):
    """Provide elasticsearch access, create and clean indices.

    Don't create template so that the test or another fixture can modify the
    enabled events.
    """
    current_search_client.indices.delete(index='*')
    current_search_client.indices.delete_template('*')
    list(current_search.create())
    try:
        yield current_search_client
    finally:
        current_search_client.indices.delete(index='*')
        current_search_client.indices.delete_template('*')
Example #8
def generate_events(app, file_number=5, event_number=100, robot_event_number=0,
                    start_date=datetime.date(2017, 1, 1),
                    end_date=datetime.date(2017, 1, 7)):
    """Queued events for processing tests."""
    current_queues.declare()

    for t in current_search.put_templates(ignore=[400]):
        pass

    def generator_list():
        for file_idx in range(file_number):
            for entry_date in date_range(start_date, end_date):
                entry_date = datetime.datetime.combine(
                    entry_date, datetime.time())
                file_id = '{0}-{1}'.format(entry_date.strftime('%Y-%m-%d'),
                                           file_idx)

                def build_event(is_robot=False):
                    return dict(
                        timestamp=entry_date.isoformat(),
                        bucket_id=file_id,
                        file_id=file_id,
                        file_key='test.pdf',
                        visitor_id=100,
                        is_robot=is_robot
                    )

                for event_idx in range(event_number):
                    yield build_event()
                for event_idx in range(robot_event_number):
                    yield build_event(True)

    mock_queue = Mock()
    mock_queue.consume.return_value = generator_list()
    mock_queue.routing_key = 'stats-file-download'

    EventsIndexer(
        mock_queue,
        preprocessors=[
            build_file_unique_id
        ]
    ).run()
    current_search_client.indices.flush(index='*')
Example #9
def generate_events(app,
                    file_number=5,
                    event_number=100,
                    robot_event_number=0,
                    start_date=datetime.date(2017, 1, 1),
                    end_date=datetime.date(2017, 1, 7)):
    """Queued events for processing tests."""
    current_queues.declare()

    for t in current_search.put_templates(ignore=[400]):
        pass

    def generator_list():
        for file_idx in range(file_number):
            for entry_date in date_range(start_date, end_date):
                entry_date = datetime.datetime.combine(entry_date,
                                                       datetime.time())
                file_id = '{0}-{1}'.format(entry_date.strftime('%Y-%m-%d'),
                                           file_idx)

                def build_event(is_robot=False):
                    return dict(timestamp=entry_date.isoformat(),
                                bucket_id=file_id,
                                file_id=file_id,
                                file_key='test.pdf',
                                visitor_id=100,
                                is_robot=is_robot)

                for event_idx in range(event_number):
                    yield build_event()
                for event_idx in range(robot_event_number):
                    yield build_event(True)

    mock_queue = Mock()
    mock_queue.consume.return_value = generator_list()
    mock_queue.routing_key = 'stats-file-download'

    EventsIndexer(mock_queue, preprocessors=[build_file_unique_id]).run()
    current_search_client.indices.flush(index='*')
Example #10
def test_overwriting_aggregations(app, es, event_queues, sequential_ids):
    """Check that the StatAggregator correctly starts from bookmark.

    1. Create sample file download event and process it.
    2. Run aggregator and write count, in aggregation index.
    3. Create new events and repeat procedure to assert that the
        results within the interval of the previous events
        overwrite the aggregation,
        by checking that the document version has increased.
    """
    for t in current_search.put_templates(ignore=[400]):
        pass

    class NewDate(datetime.datetime):
        """datetime.datetime mock."""
        # Aggregate at 12:00, thus the day will be aggregated again later
        current_date = (2017, 6, 2, 12)

        @classmethod
        def utcnow(cls):
            return cls(*cls.current_date)

    # Send some events
    event_type = 'file-download'
    events = [_create_file_download_event(date) for date in
              [(2017, 6, 1), (2017, 6, 2, 10)]]
    current_queues.declare()
    current_stats.publish(event_type, events)
    process_events(['file-download'])
    current_search_client.indices.flush(index='*')
    with patch('datetime.datetime', NewDate):
        aggregate_events(['file-download-agg'])

    # Check that the first aggregation pass wrote version 1 documents.
    res = current_search_client.search(index='stats-file-download',
                                       version=True)
    for hit in res['hits']['hits']:
        if 'file_id' in hit['_source'].keys():
            assert hit['_version'] == 1

    # Send new events, some on the last aggregated day and some far
    # in the future.
    new_events = [_create_file_download_event(date) for date in
                  [(2017, 6, 2, 15),  # second event on the same date
                   (2017, 7, 1)]]
    current_stats.publish(event_type, new_events)
    process_events(['file-download'])
    current_search_client.indices.flush(index='*')

    # Aggregate again. The aggregation should start from the last bookmark.
    NewDate.current_date = (2017, 7, 2)
    with patch('datetime.datetime', NewDate):
        aggregate_events(['file-download-agg'])
    current_search_client.indices.flush(index='*')

    res = current_search_client.search(
        index='stats-file-download',
        doc_type='file-download-day-aggregation',
        version=True
    )
    for hit in res['hits']['hits']:
        if hit['_source']['timestamp'] == '2017-06-02T00:00:00':
            assert hit['_version'] == 2
            assert hit['_source']['count'] == 2
        else:
            assert hit['_version'] == 1
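
_create_file_download_event is a conftest helper that is not reproduced on this page. Judging from the calls above, it accepts a (year, month, day[, hour]) tuple; a plausible sketch, with field values borrowed from the build_event helper in Example #3:

import datetime


def _create_file_download_event(timestamp):
    """Build one file-download event dated by the given date tuple (sketch)."""
    return dict(
        timestamp=datetime.datetime(*timestamp).isoformat(),
        # Illustrative constants, mirroring Example #3:
        bucket_id='B0000000000000000000000000000001',
        file_id='F0000000000000000000000000000001',
        file_key='test.pdf',
        size=9000,
        visitor_id=100,
        is_robot=False,
    )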
Example #11
def indexed_events(app, es, mock_user_ctx, request):
    """Parametrized pre indexed sample events."""
    for t in current_search.put_templates(ignore=[400]):
        pass
    generate_events(app=app, **request.param)
    yield
Example #12
def es_with_templates(app, es):
    """Provide elasticsearch access, create and clean indices and templates."""
    list(current_search.put_templates())
    yield current_search_client
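
This fixture layers the search templates on top of the template-less es fixture from Example #7, for tests that need the default event templates in place. A hypothetical test using it:

def test_event_templates(app, es_with_templates):
    """Hypothetical test that relies on the registered templates."""
    assert es_with_templates.ping()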
Example #13
def test_file_download_statistics(app, test_community, test_users,
                                  test_records, login_user):
    """Test checking a record's DOI using CLI commands."""
    with app.app_context():

        def url_for(*args, **kwargs):
            """Generate url using flask.url_for and the current app ctx."""
            with app.app_context():
                return flask_url_for(*args, **kwargs)

        # create user that will create the record and the files
        allowed_user = create_user('allowed')

        scopes = current_oauth2server.scope_choices()
        allowed_token = Token.create_personal('allowed_token',
                                              allowed_user.id,
                                              scopes=[s[0] for s in scopes])
        # application authentication token header
        allowed_headers = [('Authorization',
                            'Bearer {}'.format(allowed_token.access_token))]

        community_name = 'MyTestCommunity1'
        community = Community.get(name=community_name)
        com_admin = create_user('com_admin2', roles=[community.admin_role])
        com_admin_token = Token.create_personal('com_admin_token',
                                                com_admin.id,
                                                scopes=[s[0] for s in scopes])
        # application authentication token header
        com_admin_headers = [
            ('Authorization',
             'Bearer {}'.format(com_admin_token.access_token)),
            ('User-Agent', 'Mozilla/5.0 (Windows NT 6.1; WOW64) '
             'AppleWebKit/537.36 (KHTML, like Gecko) '
             'Chrome/45.0.2454.101 Safari/537.36')
        ]
        publish_headers = [('Content-Type', 'application/json-patch+json'),
                           ('Accept', 'application/json')] + com_admin_headers
        submit_headers = [('Content-Type', 'application/json-patch+json'),
                          ('Accept', 'application/json')] + allowed_headers
        stats_headers = [('Content-Type', 'application/json')]

        test_records_data = [
            generate_record_data(community=test_community.name)
            for idx in range(1, 3)
        ]

        for record_data in test_records_data:
            with app.test_client() as client:
                login_user(allowed_user, client)

                record_list_url = (lambda **kwargs: url_for(
                    'b2share_records_rest.b2rec_list', **kwargs))

                headers = [('Content-Type', 'application/json'),
                           ('Accept', 'application/json')] + allowed_headers
                draft_create_res = client.post(record_list_url(),
                                               data=json.dumps(record_data),
                                               headers=headers)
                assert draft_create_res.status_code == 201
                draft_create_data = json.loads(
                    draft_create_res.get_data(as_text=True))

                uploaded_files = {
                    'myfile1.html': b'contents1',
                    'myfile2.html': b'contents2'
                }

                for file_key, file_content in uploaded_files.items():
                    # Test file upload
                    headers = [('Accept', '*/*'),
                               ('Content-Type', 'text/html; charset=utf-8')
                               ] + allowed_headers
                    object_url = '{0}/{1}'.format(
                        draft_create_data['links']['files'], file_key)
                    file_put_res = client.put(
                        object_url,
                        input_stream=BytesIO(file_content),
                        headers=headers)
                    assert file_put_res.status_code == 200
                    file_put_data = json.loads(
                        file_put_res.get_data(as_text=True))
                    assert 'created' in file_put_data

                    bucket_id = draft_create_data['links']['files'].split(
                        '/')[-1]
                    # make sure that downloads from deposits are skipped
                    client.get(
                        url_for('invenio_files_rest.object_api',
                                bucket_id=bucket_id,
                                key=file_key))
                    assert process_events(['file-download']) == \
                        [('file-download', (0, 0))]

                # test draft submit
                draft_submit_res = client.patch(
                    url_for('b2share_deposit_rest.b2dep_item',
                            pid_value=draft_create_data['id']),
                    data=json.dumps([{
                        "op": "replace",
                        "path": "/publication_state",
                        "value": PublicationStates.submitted.name
                    }]),
                    headers=submit_headers)
                assert draft_submit_res.status_code == 200

            with app.test_client() as client:
                login_user(com_admin, client)
                # test draft publish
                draft_publish_res = client.patch(
                    url_for('b2share_deposit_rest.b2dep_item',
                            pid_value=draft_create_data['id']),
                    data=json.dumps([{
                        "op": "replace",
                        "path": "/publication_state",
                        "value": PublicationStates.published.name
                    }]),
                    headers=publish_headers)

                assert draft_publish_res.status_code == 200
                draft_publish_data = json.loads(
                    draft_publish_res.get_data(as_text=True))

                # Test record GET
                record_get_res = client.get(
                    url_for('b2share_records_rest.b2rec_item',
                            pid_value=draft_publish_data['id']),
                    headers=headers)
                assert record_get_res.status_code == 200
                record_get_data = json.loads(
                    record_get_res.get_data(as_text=True))

                # make sure that the templates are in Elasticsearch
                list(current_search.put_templates())

                # test that a record is accessible through the rest api
                file1 = record_get_data['files'][0]

                # download once
                client.get(url_for('invenio_files_rest.object_api',
                                   bucket_id=file1['bucket'],
                                   key=file1['key']),
                           headers=com_admin_headers)
                # make sure that the queue contains the event
                assert list(
                    current_queues.queues['stats-file-download'].consume())

                # download again
                client.get(url_for('invenio_files_rest.object_api',
                                   bucket_id=file1['bucket'],
                                   key=file1['key']),
                           headers=com_admin_headers)

                process_events(['file-download'])
                current_search_client.indices.refresh('*')
                # make sure that a new index for events was created in ES
                assert current_search_client.indices.exists(
                    index='events-stats-file-download')

                aggregate_events(['file-download-agg'])
                current_search_client.indices.refresh('*')

                # make sure that a new aggregation index was created in ES
                assert current_search_client.indices.exists(
                    index='stats-file-download')

                stats_ret = client.post(url_for('invenio_stats.stat_query'),
                                        data=json.dumps({
                                            'mystat': {
                                                'stat':
                                                'bucket-file-download-total',
                                                'params': {
                                                    'start_date': '2017-01-01',
                                                    'bucket_id':
                                                    file1['bucket'],
                                                }
                                            }
                                        }),
                                        headers=stats_headers)
                stats_ret_data = json.loads(stats_ret.get_data(as_text=True))
                assert stats_ret_data['mystat']['buckets'][0]['value'] == 1.0