Example #1
def test_events_indexer_id_windowing(app, mock_event_queue):
    """Check that EventsIndexer applies time windows to ids."""

    indexer = EventsIndexer(mock_event_queue,
                            preprocessors=[],
                            double_click_window=180)

    # Generated docs will be registered in this list
    received_docs = []

    def bulk(client, generator, *args, **kwargs):
        received_docs.extend(generator)

    mock_event_queue.consume.return_value = [
        _create_file_download_event(date) for date in [
            # These two events fall in the same window
            (2017, 6, 1, 0, 11, 3),
            (2017, 6, 1, 0, 9, 1),
            # These two events fall in the same window
            (2017, 6, 2, 0, 12, 10),
            (2017, 6, 2, 0, 13, 3),
            (2017, 6, 2, 0, 30, 3)
        ]
    ]

    with patch('elasticsearch.helpers.bulk', side_effect=bulk):
        indexer.run()

    assert len(received_docs) == 5
    ids = set(doc['_id'] for doc in received_docs)
    assert len(ids) == 3
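
Note: every example in this listing relies on a _create_file_download_event
helper from the test suite that is not shown. A minimal sketch of what such a
helper could look like, with the payload fields inferred from the values the
examples assert on (bucket_id, file_key, size) rather than taken from the
library's real fixture:

import datetime

def _create_file_download_event(timestamp, user_id=None, size=9000,
                                visitor_id=100):
    """Hypothetical helper: build a file-download event dict.

    ``timestamp`` is a tuple of datetime components, e.g. (2018, 1, 1, 12).
    """
    dt = datetime.datetime(*timestamp)
    return {
        'timestamp': dt.isoformat(),
        'bucket_id': 'B0000000000000000000000000000001',
        'file_id': 'F0000000000000000000000000000001',
        'file_key': 'test.pdf',
        'size': size,
        # A fixed visitor lets near-simultaneous events deduplicate
        'visitor_id': visitor_id,
        # Assumed: one session per visitor and hour, consistent with
        # Example #7 asserting unique_count == 4 over four hour slices
        'unique_session_id': '{0}-{1}'.format(visitor_id,
                                              dt.strftime('%Y%m%d%H')),
        'user_id': user_id,
    }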
Example #2
def test_index_prefix(config_with_index_prefix, app, event_queues,
                      es_with_templates):
    es = es_with_templates
    search = Search(using=es)

    # 1) publish events in the queue
    current_stats.publish('file-download', [
        _create_file_download_event(date)
        for date in [(2018, 1, 1), (2018, 1, 2), (2018, 1, 3), (2018, 1, 4)]
    ])

    queue = current_queues.queues['stats-file-download']
    assert get_queue_size('stats-file-download') == 4

    # 2) preprocess events
    indexer = EventsIndexer(queue, preprocessors=[flag_machines, flag_robots])
    indexer.run()
    es.indices.refresh(index='*')

    assert get_queue_size('stats-file-download') == 0

    index_prefix = config_with_index_prefix['SEARCH_INDEX_PREFIX']
    index_name = index_prefix + 'events-stats-file-download'

    assert es.indices.exists(index_name + '-2018-01-01')
    assert es.indices.exists(index_name + '-2018-01-02')
    assert es.indices.exists(index_name + '-2018-01-03')
    assert es.indices.exists(index_name + '-2018-01-04')
    assert es.indices.exists_alias(name=index_name)

    # 3) aggregate events
    aggregate_events(['file-download-agg'])
    es.indices.refresh(index='*')
    assert es.indices.exists(index_prefix + 'stats-file-download-2018-01')

    # 4) queries
    query_configs = register_queries()
    histo_query = ESDateHistogramQuery(query_name='test_histo',
                                       **query_configs[0]['query_config'])
    results = histo_query.run(bucket_id='B0000000000000000000000000000001',
                              file_key='test.pdf',
                              start_date=datetime.datetime(2018, 1, 1),
                              end_date=datetime.datetime(2018, 1, 3))
    assert len(results['buckets'])
    for day_result in results['buckets']:
        assert int(day_result['value']) == 1

    terms_query = ESTermsQuery(query_name='test_total_count',
                               **query_configs[1]['query_config'])
    results = terms_query.run(bucket_id='B0000000000000000000000000000001',
                              start_date=datetime.datetime(2018, 1, 1),
                              end_date=datetime.datetime(2018, 1, 7))
    assert int(results['buckets'][0]['value']) == 4
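
The config_with_index_prefix fixture is not shown; presumably it just sets
SEARCH_INDEX_PREFIX on the application config so that every index and alias
name gets prefixed. A possible sketch (the 'test-' value is an assumption):

import pytest

@pytest.fixture()
def config_with_index_prefix(app):
    # Hypothetical fixture: prefix all search indices for this test module
    app.config['SEARCH_INDEX_PREFIX'] = 'test-'
    return app.config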
Example #3
def test_double_clicks(app, mock_event_queue, es):
    """Test that events occurring within a time window are counted as 1."""
    event_type = 'file-download'
    events = [
        _create_file_download_event(date)
        for date in [(2000, 6, 1, 10, 0, 10), (2000, 6, 1, 10, 0, 11),
                     (2000, 6, 1, 10, 0, 19), (2000, 6, 1, 10, 0, 22)]
    ]
    current_queues.declare()
    current_stats.publish(event_type, events)
    process_events(['file-download'])
    es.indices.refresh(index='*')
    res = es.search(index='events-stats-file-download-2000-06-01')
    assert res['hits']['total'] == 2
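
The collapsing works because the indexer derives each document's _id from the
event's identifying fields plus its timestamp truncated to the double-click
window, so events in the same window overwrite one another instead of piling
up. A rough sketch of the idea; the hashing scheme and the 10-second default
are assumptions chosen to be consistent with the counts asserted above, not
the library's exact implementation:

import datetime
import hashlib

def _window_doc_id(event, double_click_window=10):
    """Illustrative: deterministic _id so that events from the same visitor
    and file within one window collapse into a single document."""
    ts = datetime.datetime.fromisoformat(event['timestamp'])
    # Truncate the timestamp to the start of its double-click window
    window = int(ts.timestamp()) // double_click_window
    key = '{0}-{1}-{2}'.format(event['visitor_id'], event['file_id'], window)
    return hashlib.sha1(key.encode('utf-8')).hexdigest()

With a 10-second window, seconds 10, 11 and 19 share a window and second 22
starts a new one, which matches res['hits']['total'] == 2.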
Example #4
def test_bookmark_removal(app, es_with_templates, mock_event_queue):
    """Remove aggregation bookmark and restart aggregation.

    This simulates the scenario where aggregations have been created but
    the bookmarks have not been set due to an error.
    """
    mock_event_queue.consume.return_value = [
        _create_file_download_event(date) for date in [
            (2017, 6, 2, 15),  # second event on the same date
            (2017, 7, 1)
        ]
    ]
    indexer = EventsIndexer(mock_event_queue)
    indexer.run()
    current_search_client.indices.refresh(index='*')

    def aggregate_and_check_version(expected_version):
        # Aggregate events
        StatAggregator(name='file-download-agg',
                       event='file-download',
                       aggregation_field='file_id',
                       aggregation_interval='day',
                       query_modifiers=[]).run()
        current_search_client.indices.refresh(index='*')
        res = current_search_client.search(
            index='stats-file-download',
            doc_type='file-download-day-aggregation',
            version=True)
        for hit in res['hits']['hits']:
            assert hit['_version'] == expected_version

    aggregate_and_check_version(1)
    aggregate_and_check_version(1)
    # Delete all bookmarks
    bookmarks = Search(
        using=current_search_client,
        index='stats-file-download',
        doc_type='file-download-agg-bookmark').query('match_all')
    for bookmark in bookmarks:
        current_search_client.delete(
            index=bookmark.meta.index,
            id=bookmark.meta.id,
            doc_type='file-download-agg-bookmark')
    current_search_client.indices.refresh(index='*')
    # the aggregations should have been overwritten
    aggregate_and_check_version(2)
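
For the bookmark deletion to force re-aggregation, the aggregator presumably
resumes from the most recent bookmark document when one exists and starts
from the earliest event otherwise. A sketch of how reading that bookmark
could look, reusing the index and doc type from the test above (the 'date'
field name is an assumption):

from elasticsearch_dsl import Search

def get_bookmark(client):
    """Illustrative: return the date of the latest bookmark, or None."""
    res = Search(using=client,
                 index='stats-file-download',
                 doc_type='file-download-agg-bookmark') \
        .sort('-date')[0:1].execute()
    return res[0].date if res else None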
Example #5
def test_failing_processors(app, es, event_queues, caplog):
    """Test events that raise an exception when processed."""
    search = Search(using=es)

    current_queues.declare()
    current_stats.publish('file-download', [
        _create_file_download_event(date)
        for date in [(2018, 1, 1), (2018, 1, 2), (2018, 1, 3), (2018, 1, 4)]
    ])

    def _raises_on_second_call(doc):
        if _raises_on_second_call.calls == 1:
            _raises_on_second_call.calls += 1
            raise Exception('mocked-exception')
        _raises_on_second_call.calls += 1
        return doc

    _raises_on_second_call.calls = 0

    queue = current_queues.queues['stats-file-download']
    indexer = EventsIndexer(queue, preprocessors=[_raises_on_second_call])

    current_search.flush_and_refresh(index='*')
    assert get_queue_size('stats-file-download') == 4
    assert not es.indices.exists('events-stats-file-download-2018-01-01')
    assert not es.indices.exists('events-stats-file-download-2018-01-02')
    assert not es.indices.exists('events-stats-file-download-2018-01-03')
    assert not es.indices.exists('events-stats-file-download-2018-01-04')
    assert not es.indices.exists_alias(name='events-stats-file-download')

    with caplog.at_level(logging.ERROR):
        indexer.run()  # 2nd event raises exception and is dropped

    # Check that the error was logged
    error_logs = [r for r in caplog.records if r.levelno == logging.ERROR]
    assert len(error_logs) == 1
    assert error_logs[0].msg == 'Error while processing event'
    assert error_logs[0].exc_info[1].args[0] == 'mocked-exception'

    current_search.flush_and_refresh(index='*')
    assert get_queue_size('stats-file-download') == 0
    assert search.index('events-stats-file-download').count() == 3
    assert search.index('events-stats-file-download-2018-01-01').count() == 1
    assert not es.indices.exists('events-stats-file-download-2018-01-02')
    assert search.index('events-stats-file-download-2018-01-03').count() == 1
    assert search.index('events-stats-file-download-2018-01-04').count() == 1
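
Storing the call counter as an attribute on _raises_on_second_call itself is
a lightweight way to give a plain function state across calls; an equivalent
alternative would be unittest.mock.Mock with a side_effect list. Note that
the failing event is dropped rather than retried, which is why only three of
the four events end up indexed and the 2018-01-02 index is never created.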
Example #6
def test_bookmark_removal(app, es, mock_event_queue):
    """Remove aggregation bookmark and restart aggregation.

    This simulates the scenario where aggregations have been created but
    the bookmarks have not been set due to an error.
    """
    mock_event_queue.consume.return_value = [
        _create_file_download_event(date) for date in
        [(2017, 6, 2, 15),  # second event on the same date
         (2017, 7, 1)]
    ]
    indexer = EventsIndexer(mock_event_queue)
    indexer.run()
    current_search.flush_and_refresh(index='*')

    def aggregate_and_check_version(expected_version):
        StatAggregator(
            field='file_id',
            interval='day',
            name='file-download-agg',
            event='file-download',
            query_modifiers=[],
        ).run()
        current_search.flush_and_refresh(index='*')
        res = es.search(
            index='stats-file-download', version=True)
        for hit in res['hits']['hits']:
            assert hit['_version'] == expected_version

    aggregate_and_check_version(1)
    aggregate_and_check_version(1)
    # Delete all bookmarks
    bookmarks = Search(using=es, index='stats-bookmarks') \
        .filter('term', aggregation_type='file-download-agg') \
        .execute()

    for bookmark in bookmarks:
        es.delete(
            index=bookmark.meta.index, id=bookmark.meta.id,
            doc_type=get_doctype(bookmark.meta.doc_type)
        )

    current_search.flush_and_refresh(index='*')
    # the aggregations should have been overwritten
    aggregate_and_check_version(2)
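
Example #6 is the same scenario as Example #4 updated for newer Elasticsearch
versions, where custom mapping types are deprecated: bookmarks live in a
dedicated stats-bookmarks index and are matched on an aggregation_type field,
and get_doctype is presumably a small compatibility shim along these lines (a
sketch, assuming the installed client's major version tracks the server's):

from elasticsearch import VERSION as ES_VERSION

def get_doctype(doc_type):
    """Return the given doc_type on ES < 7 and '_doc' on ES >= 7, where
    custom mapping types were removed."""
    return doc_type if ES_VERSION[0] < 7 else '_doc'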
Example #7
def test_metric_aggregations(app, event_queues, es_with_templates):
    """Test aggregation metrics."""
    es = es_with_templates
    current_stats.publish('file-download', [
        _create_file_download_event(date, user_id='1')
        for date in [(2018, 1, 1, 12, 10), (2018, 1, 1, 12, 20),
                     (2018, 1, 1, 12, 30), (2018, 1, 1, 13, 10),
                     (2018, 1, 1, 13, 20), (2018, 1, 1, 13, 30),
                     (2018, 1, 1, 14, 10), (2018, 1, 1, 14, 20),
                     (2018, 1, 1, 14, 30), (2018, 1, 1, 15, 10),
                     (2018, 1, 1, 15, 20), (2018, 1, 1, 15, 30)]
    ])
    process_events(['file-download'])
    es.indices.refresh(index='*')

    StatAggregator(name='file-download-agg',
                   client=current_search_client,
                   event='file-download',
                   aggregation_field='file_id',
                   metric_aggregation_fields={
                       'unique_count': ('cardinality', 'unique_session_id', {
                           'precision_threshold': 1000
                       }),
                       'volume': ('sum', 'size', {})
                   },
                   aggregation_interval='day').run()
    es.indices.refresh(index='*')

    query = Search(using=current_search_client,
                   index='stats-file-download',
                   doc_type='file-download-day-aggregation')

    results = query.execute()
    assert len(results) == 1
    assert results[0].count == 12  # 3 downloads in each of 4 hour slices
    assert results[0].unique_count == 4  # 4 different hour slices accessed
    assert results[0].volume == 9000 * 12
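
Each metric_aggregation_fields entry maps a result field name to a metric
type, a source field and extra options; these presumably end up as standard
Elasticsearch metric sub-aggregations nested under the terms aggregation on
file_id, roughly like the request body below (an illustration, not the
aggregator's literal output):

aggs_body = {
    'file_id': {
        'terms': {'field': 'file_id'},
        'aggs': {
            'unique_count': {'cardinality': {
                'field': 'unique_session_id',
                'precision_threshold': 1000,
            }},
            'volume': {'sum': {'field': 'size'}},
        },
    },
}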
Example #8
def test_overwriting_aggregations(app, mock_event_queue, es_with_templates):
    """Check that the StatAggregator correctly starts from bookmark.

    1. Create a sample file-download event and process it.
    2. Run the aggregator and write the count to the aggregation index.
    3. Create new events and repeat the procedure, asserting that results
       within the interval of the previous events overwrite the aggregation
       by checking that the document version has increased.
    """
    class NewDate(datetime.datetime):
        """datetime.datetime mock."""
        # Aggregate at 12:00, thus the day will be aggregated again later
        current_date = (2017, 6, 2, 12)

        @classmethod
        def utcnow(cls):
            return cls(*cls.current_date)

    # Send some events
    event_type = 'file-download'
    mock_event_queue.consume.return_value = [
        _create_file_download_event(date)
        for date in [(2017, 6, 1), (2017, 6, 2, 10)]
    ]

    indexer = EventsIndexer(mock_event_queue)
    indexer.run()
    current_search_client.indices.refresh(index='*')

    # Aggregate events
    with patch('datetime.datetime', NewDate):
        aggregate_events(['file-download-agg'])
    current_search_client.indices.refresh(index='*')

    # Check that the initial aggregations have version 1
    res = current_search_client.search(index='stats-file-download',
                                       version=True)
    for hit in res['hits']['hits']:
        if 'file_id' in hit['_source']:
            assert hit['_version'] == 1

    # Send new events, some on the last aggregated day and some far
    # in the future.
    mock_event_queue.consume.return_value = [
        _create_file_download_event(date) for date in [
            (2017, 6, 2, 15),  # second event on the same date
            (2017, 7, 1)
        ]
    ]
    indexer = EventsIndexer(mock_event_queue)
    indexer.run()
    current_search_client.indices.refresh(index='*')

    # Aggregate again. The aggregation should start from the last bookmark.
    NewDate.current_date = (2017, 7, 2)
    with patch('datetime.datetime', NewDate):
        aggregate_events(['file-download-agg'])
    current_search_client.indices.refresh(index='*')

    res = current_search_client.search(
        index='stats-file-download',
        doc_type='file-download-day-aggregation',
        version=True)
    for hit in res['hits']['hits']:
        if hit['_source']['timestamp'] == '2017-06-02T00:00:00':
            assert hit['_version'] == 2
            assert hit['_source']['count'] == 2
        else:
            assert hit['_version'] == 1
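
The NewDate subclass is needed because datetime.datetime is implemented in C
and its methods cannot be patched individually; replacing the whole class
with a subclass that overrides utcnow is the standard workaround, and since
NewDate still subclasses the real datetime.datetime, isinstance checks in the
code under test keep working.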
Example #9
def test_events_process(script_info, event_queues, es_with_templates):
    """Test "events process" CLI command."""
    es = es_with_templates
    search = Search(using=es)
    runner = CliRunner()

    # Invalid argument
    result = runner.invoke(
        stats, ['events', 'process', 'invalid-event-type', '--eager'],
        obj=script_info)
    assert result.exit_code == 2
    assert 'Invalid event type(s):' in result.output

    current_stats.publish('file-download', [
        _create_file_download_event(date)
        for date in [(2018, 1, 1, 10), (2018, 1, 1, 12), (2018, 1, 1, 14)]
    ])
    current_stats.publish('record-view', [
        _create_record_view_event(date)
        for date in [(2018, 1, 1, 10), (2018, 1, 1, 12), (2018, 1, 1, 14)]
    ])

    result = runner.invoke(stats,
                           ['events', 'process', 'file-download', '--eager'],
                           obj=script_info)
    assert result.exit_code == 0

    current_search.flush_and_refresh(index='*')

    assert search.index('events-stats-file-download-2018-01-01').count() == 3
    assert search.index('events-stats-file-download').count() == 3
    assert not es.indices.exists('events-stats-record-view-2018-01-01')
    assert not es.indices.exists_alias(name='events-stats-record-view')

    result = runner.invoke(stats,
                           ['events', 'process', 'record-view', '--eager'],
                           obj=script_info)
    assert result.exit_code == 0

    current_search.flush_and_refresh(index='*')
    assert search.index('events-stats-file-download-2018-01-01').count() == 3
    assert search.index('events-stats-file-download').count() == 3
    assert search.index('events-stats-record-view-2018-01-01').count() == 3
    assert search.index('events-stats-record-view').count() == 3

    # Create some more events
    current_stats.publish('file-download',
                          [_create_file_download_event((2018, 2, 1, 12))])
    current_stats.publish('record-view',
                          [_create_record_view_event((2018, 2, 1, 10))])

    # Process all event types via a celery task
    result = runner.invoke(stats, ['events', 'process'], obj=script_info)
    assert result.exit_code == 0

    current_search.flush_and_refresh(index='*')
    assert search.index('events-stats-file-download-2018-01-01').count() == 3
    assert search.index('events-stats-file-download-2018-02-01').count() == 1
    assert search.index('events-stats-file-download').count() == 4
    assert search.index('events-stats-record-view-2018-01-01').count() == 3
    assert search.index('events-stats-record-view-2018-02-01').count() == 1
    assert search.index('events-stats-record-view').count() == 4
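
Outside the test runner, the same command group is presumably exposed on the
application's CLI (assuming the usual entry-point registration), e.g.:

    $ invenio stats events process file-download --eager
    $ invenio stats events process   # all event types, queued as Celery tasks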