@pytest.fixture(scope='module')
def es(appctx):
    """Set up and tear down all registered Elasticsearch indices.

    Scope: module

    This fixture will create all registered indexes in Elasticsearch and
    remove them once done. Fixtures that perform changes (e.g. index or
    remove documents) should use the function-scoped :py:data:`es_clear`
    fixture to leave the indexes clean for the following tests.
    """
    from invenio_search import current_search, current_search_client
    from invenio_search.errors import IndexAlreadyExistsError

    try:
        list(current_search.put_templates())
    except IndexAlreadyExistsError:
        current_search_client.indices.delete_template('*')
        list(current_search.put_templates())
    try:
        list(current_search.create())
    except IndexAlreadyExistsError:
        list(current_search.delete(ignore=[404]))
        list(current_search.create())
    current_search_client.indices.refresh()
    try:
        yield current_search_client
    finally:
        current_search_client.indices.delete(index='*')
        current_search_client.indices.delete_template('*')
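# A minimal sketch of the function-scoped ``es_clear`` fixture referenced in
# the docstring above (assumed shape, not the actual implementation): it
# yields the module-scoped client and wipes all documents afterwards, so the
# indices and mappings survive but each test starts from a clean state.
@pytest.fixture()
def es_clear(es):
    """Remove all documents after the test, keeping indices and mappings."""
    yield es
    # Assumption: a delete_by_query across all indices followed by a refresh
    # is enough to leave the indexes clean for the following tests.
    es.delete_by_query(index='_all', body={'query': {'match_all': {}}})
    es.indices.refresh()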
@pytest.fixture()
def aggregated_events(app, es, mock_user_ctx, request):
    """Parametrized pre-indexed sample events."""
    for t in current_search.put_templates(ignore=[400]):
        pass
    generate_events(app=app, **request.param)
    aggregate_events(['file-download-agg'])
    current_search.flush_and_refresh(index='*')
    yield
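# Illustrative use of the fixture above: the keyword arguments for the
# ``generate_events`` helper below are supplied through indirect
# parametrization, so they arrive as ``request.param``. The values here are
# hypothetical and only show the expected shape.
@pytest.mark.parametrize('aggregated_events', [
    dict(file_number=2, event_number=10, robot_event_number=1)
], indirect=['aggregated_events'])
def test_aggregated_sample(app, es, aggregated_events):
    ...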
def generate_events(app, file_number=5, event_number=100,
                    robot_event_number=0,
                    start_date=datetime.date(2017, 1, 1),
                    end_date=datetime.date(2017, 1, 7)):
    """Queued events for processing tests."""
    current_queues.declare()
    for t in current_search.put_templates(ignore=[400]):
        pass

    def _unique_ts_gen():
        # Monotonically increasing counter so that events generated for the
        # same day do not share identical timestamps.
        ts = 0
        while True:
            ts += 1
            yield ts

    def generator_list():
        unique_ts = _unique_ts_gen()
        for file_idx in range(file_number):
            for entry_date in date_range(start_date, end_date):
                file_id = 'F000000000000000000000000000000{}'.format(
                    file_idx + 1)
                bucket_id = 'B000000000000000000000000000000{}'.format(
                    file_idx + 1)

                def build_event(is_robot=False):
                    ts = next(unique_ts)
                    return dict(
                        timestamp=datetime.datetime.combine(
                            entry_date,
                            datetime.time(minute=ts % 60, second=ts % 60)
                        ).isoformat(),
                        bucket_id=bucket_id,
                        file_id=file_id,
                        file_key='test.pdf',
                        size=9000,
                        visitor_id=100,
                        is_robot=is_robot,
                    )

                for event_idx in range(event_number):
                    yield build_event()
                for event_idx in range(robot_event_number):
                    yield build_event(True)

    mock_queue = Mock()
    mock_queue.consume.return_value = generator_list()
    mock_queue.routing_key = 'stats-file-download'
    # double_click_window=0 disables double-click deduplication, so every
    # generated event is indexed.
    EventsIndexer(
        mock_queue,
        preprocessors=[build_file_unique_id],
        double_click_window=0,
    ).run()
    current_search_client.indices.refresh(index='*')
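# ``date_range`` is a test helper used above but not shown. A minimal sketch
# of the assumed implementation (inclusive of the end date, which matches the
# 2017-01-08 bookmark asserted in ``test_get_bookmark`` below); it relies on
# the module-level ``import datetime`` used throughout these snippets.
def date_range(start_date, end_date):
    """Yield every date from start_date to end_date, inclusive."""
    for offset in range((end_date - start_date).days + 1):
        yield start_date + datetime.timedelta(days=offset)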
def elasticsearch_index_init(alembic, verbose):
    """Initialize the Elasticsearch indices and indexing queue."""
    for _ in current_search.create(ignore=[400]):
        pass
    for _ in current_search.put_templates(ignore=[400]):
        pass
    queue = current_app.config['INDEXER_MQ_QUEUE']
    with establish_connection() as c:
        # Calling the kombu Queue binds it to the connection before
        # declaring it on the broker.
        q = queue(c)
        q.declare()
def test_get_bookmark(app, indexed_events):
    """Test bookmark reading."""
    for t in current_search.put_templates(ignore=[400]):
        pass
    stat_agg = StatAggregator(name='file-download-agg',
                              client=current_search_client,
                              event='file-download',
                              aggregation_field='file_id',
                              aggregation_interval='day')
    stat_agg.run()
    assert stat_agg.get_bookmark() == datetime.datetime(2017, 1, 8)
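# Note: ``indexed_events`` (see the fixture below) indexes events produced by
# ``generate_events``, whose default end date is 2017-01-07, so the bookmark
# presumably points at the day following the last aggregated event.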
@pytest.fixture()
def es(app):
    """Provide Elasticsearch access."""
    list(current_search.delete(ignore=[400, 404]))
    current_search_client.indices.delete(index='*')
    current_search_client.indices.delete_template('*')
    list(current_search.create())
    list(current_search.put_templates())
    current_search_client.indices.refresh()
    try:
        yield current_search_client
    finally:
        current_search_client.indices.delete(index='*')
        current_search_client.indices.delete_template('*')
@pytest.fixture()
def es(app):
    """Provide Elasticsearch access, create and clean indices.

    Don't create templates so that the test or another fixture can modify
    the enabled events.
    """
    current_search_client.indices.delete(index='*')
    current_search_client.indices.delete_template('*')
    # Templates are intentionally not created here; use the
    # ``es_with_templates`` fixture when they are needed.
    list(current_search.create())
    try:
        yield current_search_client
    finally:
        current_search_client.indices.delete(index='*')
        current_search_client.indices.delete_template('*')
def generate_events(app, file_number=5, event_number=100,
                    robot_event_number=0,
                    start_date=datetime.date(2017, 1, 1),
                    end_date=datetime.date(2017, 1, 7)):
    """Queued events for processing tests."""
    current_queues.declare()
    for t in current_search.put_templates(ignore=[400]):
        pass

    def generator_list():
        for file_idx in range(file_number):
            for entry_date in date_range(start_date, end_date):
                entry_date = datetime.datetime.combine(
                    entry_date, datetime.time())
                file_id = '{0}-{1}'.format(
                    entry_date.strftime('%Y-%m-%d'), file_idx)

                def build_event(is_robot=False):
                    return dict(
                        timestamp=entry_date.isoformat(),
                        bucket_id=file_id,
                        file_id=file_id,
                        file_key='test.pdf',
                        visitor_id=100,
                        is_robot=is_robot,
                    )

                for event_idx in range(event_number):
                    yield build_event()
                for event_idx in range(robot_event_number):
                    yield build_event(True)

    mock_queue = Mock()
    mock_queue.consume.return_value = generator_list()
    mock_queue.routing_key = 'stats-file-download'
    EventsIndexer(mock_queue,
                  preprocessors=[build_file_unique_id]).run()
    current_search_client.indices.flush(index='*')
def test_overwriting_aggregations(app, es, event_queues, sequential_ids):
    """Check that the StatAggregator correctly starts from the bookmark.

    1. Create sample file-download events and process them.
    2. Run the aggregator and write the count to the aggregation index.
    3. Create new events and repeat the procedure, asserting that results
       within the interval of the previous events overwrite the
       aggregation, by checking that the document version has increased.
    """
    for t in current_search.put_templates(ignore=[400]):
        pass

    class NewDate(datetime.datetime):
        """datetime.datetime mock."""

        # Aggregate at 12:00, thus the day will be aggregated again later
        current_date = (2017, 6, 2, 12)

        @classmethod
        def utcnow(cls):
            return cls(*cls.current_date)

    # Send some events
    event_type = 'file-download'
    events = [_create_file_download_event(date)
              for date in [(2017, 6, 1), (2017, 6, 2, 10)]]
    current_queues.declare()
    current_stats.publish(event_type, events)
    process_events(['file-download'])
    current_search_client.indices.flush(index='*')

    with patch('datetime.datetime', NewDate):
        aggregate_events(['file-download-agg'])

    # Check that the initial aggregation wrote version-1 documents.
    res = current_search_client.search(index='stats-file-download',
                                       version=True)
    for hit in res['hits']['hits']:
        if 'file_id' in hit['_source'].keys():
            assert hit['_version'] == 1

    # Send new events, some on the last aggregated day and some far
    # in the future.
    new_events = [_create_file_download_event(date)
                  for date in [(2017, 6, 2, 15),  # second event on same date
                               (2017, 7, 1)]]
    current_stats.publish(event_type, new_events)
    process_events(['file-download'])
    current_search_client.indices.flush(index='*')

    # Aggregate again. The aggregation should start from the last bookmark.
    NewDate.current_date = (2017, 7, 2)
    with patch('datetime.datetime', NewDate):
        aggregate_events(['file-download-agg'])
    current_search_client.indices.flush(index='*')

    res = current_search_client.search(
        index='stats-file-download',
        doc_type='file-download-day-aggregation',
        version=True
    )
    for hit in res['hits']['hits']:
        if hit['_source']['timestamp'] == '2017-06-02T00:00:00':
            assert hit['_version'] == 2
            assert hit['_source']['count'] == 2
        else:
            assert hit['_version'] == 1
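# ``_create_file_download_event`` is a test helper used above but not shown.
# A minimal sketch of the assumed shape, building a single file-download
# event at the given (year, month, day[, hour, ...]) tuple; the default ids
# are hypothetical placeholders.
def _create_file_download_event(timestamp,
                                bucket_id='B0000000000000000',
                                file_id='F0000000000000000',
                                file_key='test.pdf'):
    """Build a file-download event dict for the given timestamp tuple."""
    return dict(
        timestamp=datetime.datetime(*timestamp).isoformat(),
        bucket_id=bucket_id,
        file_id=file_id,
        file_key=file_key,
        visitor_id=100,
    )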
@pytest.fixture()
def indexed_events(app, es, mock_user_ctx, request):
    """Parametrized pre-indexed sample events."""
    for t in current_search.put_templates(ignore=[400]):
        pass
    generate_events(app=app, **request.param)
    yield
@pytest.fixture()
def es_with_templates(app, es):
    """Provide Elasticsearch access, create and clean indices and templates."""
    list(current_search.put_templates())
    yield current_search_client
def test_file_download_statistics(app, test_community, test_users,
                                  test_records, login_user):
    """Test file-download statistics via the REST API."""
    with app.app_context():
        def url_for(*args, **kwargs):
            """Generate url using flask.url_for and the current app ctx."""
            with app.app_context():
                return flask_url_for(*args, **kwargs)

        # create user that will create the record and the files
        scopes = current_oauth2server.scope_choices()
        allowed_user = create_user('allowed')
        allowed_token = Token.create_personal(
            'allowed_token', allowed_user.id,
            scopes=[s[0] for s in scopes])
        # application authentication token header
        allowed_headers = [('Authorization',
                            'Bearer {}'.format(allowed_token.access_token))]

        community_name = 'MyTestCommunity1'
        community = Community.get(name=community_name)
        com_admin = create_user('com_admin2', roles=[community.admin_role])
        com_admin_token = Token.create_personal(
            'com_admin_token', com_admin.id,
            scopes=[s[0] for s in scopes])
        # application authentication token header
        com_admin_headers = [
            ('Authorization',
             'Bearer {}'.format(com_admin_token.access_token)),
            ('User-Agent',
             'Mozilla/5.0 (Windows NT 6.1; WOW64) '
             'AppleWebKit/537.36 (KHTML, like Gecko) '
             'Chrome/45.0.2454.101 Safari/537.36')
        ]
        publish_headers = [('Content-Type', 'application/json-patch+json'),
                           ('Accept', 'application/json')] + com_admin_headers
        submit_headers = [('Content-Type', 'application/json-patch+json'),
                          ('Accept', 'application/json')] + allowed_headers
        stats_headers = [('Content-Type', 'application/json')]

        test_records_data = [
            generate_record_data(community=test_community.name)
            for idx in range(1, 3)
        ]

        for record_data in test_records_data:
            with app.test_client() as client:
                login_user(allowed_user, client)

                record_list_url = (
                    lambda **kwargs: url_for(
                        'b2share_records_rest.b2rec_list', **kwargs))
                headers = [('Content-Type', 'application/json'),
                           ('Accept', 'application/json')] + allowed_headers
                draft_create_res = client.post(record_list_url(),
                                               data=json.dumps(record_data),
                                               headers=headers)
                assert draft_create_res.status_code == 201
                draft_create_data = json.loads(
                    draft_create_res.get_data(as_text=True))

                uploaded_files = {
                    'myfile1.html': b'contents1',
                    'myfile2.html': b'contents2'
                }
                for file_key, file_content in uploaded_files.items():
                    # Test file upload
                    headers = [('Accept', '*/*'),
                               ('Content-Type', 'text/html; charset=utf-8')
                               ] + allowed_headers
                    object_url = '{0}/{1}'.format(
                        draft_create_data['links']['files'], file_key)
                    file_put_res = client.put(
                        object_url,
                        input_stream=BytesIO(file_content),
                        headers=headers)
                    assert file_put_res.status_code == 200
                    file_put_data = json.loads(
                        file_put_res.get_data(as_text=True))
                    assert 'created' in file_put_data

                    bucket_id = draft_create_data['links']['files'].split(
                        '/')[-1]
                    # make sure that downloads from deposits are skipped
                    client.get(url_for('invenio_files_rest.object_api',
                                       bucket_id=bucket_id,
                                       key=file_key))
                    assert process_events(['file-download']) == \
                        [('file-download', (0, 0))]

                # test draft submit
                draft_submit_res = client.patch(
                    url_for('b2share_deposit_rest.b2dep_item',
                            pid_value=draft_create_data['id']),
                    data=json.dumps([{
                        "op": "replace",
                        "path": "/publication_state",
                        "value": PublicationStates.submitted.name
                    }]),
                    headers=submit_headers)
                assert draft_submit_res.status_code == 200

            with app.test_client() as client:
                login_user(com_admin, client)
                # test draft publish
                draft_publish_res = client.patch(
                    url_for('b2share_deposit_rest.b2dep_item',
                            pid_value=draft_create_data['id']),
                    data=json.dumps([{
                        "op": "replace",
                        "path": "/publication_state",
                        "value": PublicationStates.published.name
                    }]),
                    headers=publish_headers)
                assert draft_publish_res.status_code == 200
                draft_publish_data = json.loads(
                    draft_publish_res.get_data(as_text=True))

                # Test record GET
                record_get_res = client.get(
                    url_for('b2share_records_rest.b2rec_item',
                            pid_value=draft_publish_data['id']),
                    headers=headers)
                assert record_get_res.status_code == 200
                record_get_data = json.loads(
                    record_get_res.get_data(as_text=True))

                # make sure that templates are in the ES
                list(current_search.put_templates())

                # test that a record is accessible through the rest api
                file1 = record_get_data['files'][0]
                # download once
                client.get(url_for('invenio_files_rest.object_api',
                                   bucket_id=file1['bucket'],
                                   key=file1['key']),
                           headers=com_admin_headers)
                # make sure that the queue contains the event
                assert list(
                    current_queues.queues['stats-file-download'].consume())
                # download again
                client.get(url_for('invenio_files_rest.object_api',
                                   bucket_id=file1['bucket'],
                                   key=file1['key']),
                           headers=com_admin_headers)

                process_events(['file-download'])
                current_search_client.indices.refresh('*')
                # make sure that a new index for events is created in ES
                assert current_search_client.indices.exists(
                    index='events-stats-file-download')

                aggregate_events(['file-download-agg'])
                current_search_client.indices.refresh('*')
                # make sure that a new aggregation index is created in ES
                assert current_search_client.indices.exists(
                    index='stats-file-download')

                stats_ret = client.post(
                    url_for('invenio_stats.stat_query'),
                    data=json.dumps({
                        'mystat': {
                            'stat': 'bucket-file-download-total',
                            'params': {
                                'start_date': '2017-01-01',
                                'bucket_id': file1['bucket'],
                            }
                        }
                    }),
                    headers=stats_headers)
                stats_ret_data = json.loads(
                    stats_ret.get_data(as_text=True))
                assert stats_ret_data['mystat']['buckets'][0]['value'] == 1.0