Example #1
def import_events(event_type, csv_dir, chunk_size):
    r"""Import stats events from a directory of CSV files.

    Available event types: "file-download", "record-view"

    The following columns should always be present:

    \b
    - ipAddress
    - userAgent
    - url ("https://zenodo.org/record/1234/files/article.pdf")
    - timestamp (1388506249)
    - referrer ("Google", "example.com", etc)
    """
    csv_files = glob.glob(csv_dir + '/*.csv')
    with click.progressbar(csv_files, len(csv_files)) as csv_files_bar:
        for csv_path in csv_files_bar:
            with open(csv_path, 'r' if PY3 else 'rb') as fp:
                reader = csv.DictReader(fp, delimiter=',')
                events = filter(
                    None, map(EVENT_TYPE_BUILDERS[event_type], reader))
                for event_chunk in chunkify(events, chunk_size):
                    current_stats.publish(event_type, event_chunk)
    click.secho(
        'Run the "invenio_stats.tasks.process_events" to index the events...',
        fg='yellow')
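For context, the docstring above lists the columns every input CSV must contain. A minimal input file and a direct call might look like the sketch below; the file contents, directory path, and chunk size are illustrative, and in the original project the function is more likely registered as a CLI command than called directly.

# Illustrative sample input, downloads.csv (column names taken from the docstring above):
#
#   ipAddress,userAgent,url,timestamp,referrer
#   192.0.2.1,Mozilla/5.0,https://zenodo.org/record/1234/files/article.pdf,1388506249,Google
#
# Hypothetical direct invocation:
import_events('file-download', '/path/to/csv-dir', chunk_size=100)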
Example #2
def test_publish_and_consume_events(app, event_entrypoints):
    """Test that events are published and consumed properly."""
    try:
        event_type = 'file-download'
        events = [{"payload": "test {}".format(idx)} for idx in range(3)]
        current_queues.declare()
        current_stats.publish(event_type, events)
        assert list(current_stats.consume(event_type)) == events
    finally:
        current_queues.delete()
Example #4
def test_index_prefix(config_with_index_prefix, app, event_queues,
                      es_with_templates):
    es = es_with_templates
    search = Search(using=es)

    # 1) publish events in the queue
    current_stats.publish('file-download', [
        _create_file_download_event(date)
        for date in [(2018, 1, 1), (2018, 1, 2), (2018, 1, 3), (2018, 1, 4)]
    ])

    queue = current_queues.queues['stats-file-download']
    assert get_queue_size('stats-file-download') == 4

    # 2) preprocess events
    indexer = EventsIndexer(queue, preprocessors=[flag_machines, flag_robots])
    indexer.run()
    es.indices.refresh(index='*')

    assert get_queue_size('stats-file-download') == 0

    index_prefix = config_with_index_prefix['SEARCH_INDEX_PREFIX']
    index_name = index_prefix + 'events-stats-file-download'

    assert es.indices.exists(index_name + '-2018-01-01')
    assert es.indices.exists(index_name + '-2018-01-02')
    assert es.indices.exists(index_name + '-2018-01-03')
    assert es.indices.exists(index_name + '-2018-01-04')
    assert es.indices.exists_alias(name=index_name)

    # 3) aggregate events
    aggregate_events(['file-download-agg'])
    es.indices.refresh(index='*')
    assert es.indices.exists(index_prefix + 'stats-file-download-2018-01')

    # 4) queries
    query_configs = register_queries()
    histo_query = ESDateHistogramQuery(query_name='test_histo',
                                       **query_configs[0]['query_config'])
    results = histo_query.run(bucket_id='B0000000000000000000000000000001',
                              file_key='test.pdf',
                              start_date=datetime.datetime(2018, 1, 1),
                              end_date=datetime.datetime(2018, 1, 3))
    assert len(results['buckets'])
    for day_result in results['buckets']:
        assert int(day_result['value']) == 1

    terms_query = ESTermsQuery(query_name='test_total_count',
                               **query_configs[1]['query_config'])
    results = terms_query.run(bucket_id='B0000000000000000000000000000001',
                              start_date=datetime.datetime(2018, 1, 1),
                              end_date=datetime.datetime(2018, 1, 7))
    assert int(results['buckets'][0]['value']) == 4
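Several tests on this page rely on a `_create_file_download_event(date)` helper that is not shown here. Below is a minimal sketch consistent with how it is used (a date tuple in, a file-download payload dict out); the exact fields and default values in the real test suite may differ, and the identifiers are purely illustrative.

import datetime
import uuid


def _create_file_download_event(date,
                                bucket_id='B0000000000000000000000000000001',
                                file_key='test.pdf'):
    """Illustrative stand-in for the helper used by the tests on this page."""
    return dict(
        timestamp=datetime.datetime(*date).isoformat(),
        bucket_id=bucket_id,
        file_id=str(uuid.uuid4()),
        file_key=file_key,
        user_id='1',
    )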
Example #5
def publish_filedownload(nb_events, user_id, file_key,
                         file_id, bucket_id, date):
    current_stats.publish('file-download', [dict(
        # When:
        timestamp=(
            date + timedelta(minutes=idx)
        ).isoformat(),
        # What:
        bucket_id=str(bucket_id),
        file_key=file_key,
        file_id=file_id,
        # Who:
        user_id=str(user_id)
    ) for idx in range(nb_events)])
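A call to the helper above might look like this; the identifier values are illustrative, with `file_key` and `bucket_id` mirroring the values used in the query example further up.

from datetime import datetime

publish_filedownload(
    nb_events=4,
    user_id=1,
    file_key='test.pdf',
    file_id='F0000000000000000000000000000001',
    bucket_id='B0000000000000000000000000000001',
    date=datetime(2018, 1, 1),
)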
Example #6
def publish_filedownload(nb_events, user_id, file_key, file_id, bucket_id,
                         date):
    current_stats.publish(
        'file-download',
        [
            dict(
                # When:
                timestamp=(date + timedelta(minutes=idx)).isoformat(),
                # What:
                bucket_id=str(bucket_id),
                file_key=file_key,
                file_id=file_id,
                # Who:
                user_id=str(user_id)) for idx in range(nb_events)
        ])
Example #7
def test_double_clicks(app, mock_event_queue, es):
    """Test that events occurring within a time window are counted as 1."""
    event_type = 'file-download'
    events = [
        _create_file_download_event(date)
        for date in [
            (2000, 6, 1, 10, 0, 10),
            (2000, 6, 1, 10, 0, 11),
            (2000, 6, 1, 10, 0, 19),
            (2000, 6, 1, 10, 0, 22),
        ]
    ]
    current_queues.declare()
    current_stats.publish(event_type, events)
    process_events(['file-download'])
    es.indices.refresh(index='*')
    res = es.search(index='events-stats-file-download-2000-06-01')
    assert res['hits']['total'] == 2
Example #8
def test_failing_processors(app, es, event_queues, caplog):
    """Test events that raise an exception when processed."""
    search = Search(using=es)

    current_queues.declare()
    current_stats.publish('file-download', [
        _create_file_download_event(date)
        for date in [(2018, 1, 1), (2018, 1, 2), (2018, 1, 3), (2018, 1, 4)]
    ])

    def _raises_on_second_call(doc):
        if _raises_on_second_call.calls == 1:
            _raises_on_second_call.calls += 1
            raise Exception('mocked-exception')
        _raises_on_second_call.calls += 1
        return doc

    _raises_on_second_call.calls = 0

    queue = current_queues.queues['stats-file-download']
    indexer = EventsIndexer(queue, preprocessors=[_raises_on_second_call])

    current_search.flush_and_refresh(index='*')
    assert get_queue_size('stats-file-download') == 4
    assert not es.indices.exists('events-stats-file-download-2018-01-01')
    assert not es.indices.exists('events-stats-file-download-2018-01-02')
    assert not es.indices.exists('events-stats-file-download-2018-01-03')
    assert not es.indices.exists('events-stats-file-download-2018-01-04')
    assert not es.indices.exists_alias(name='events-stats-file-download')

    with caplog.at_level(logging.ERROR):
        indexer.run()  # 2nd event raises exception and is dropped

    # Check that the error was logged
    error_logs = [r for r in caplog.records if r.levelno == logging.ERROR]
    assert len(error_logs) == 1
    assert error_logs[0].msg == 'Error while processing event'
    assert error_logs[0].exc_info[1].args[0] == 'mocked-exception'

    current_search.flush_and_refresh(index='*')
    assert get_queue_size('stats-file-download') == 0
    assert search.index('events-stats-file-download').count() == 3
    assert search.index('events-stats-file-download-2018-01-01').count() == 1
    assert not es.indices.exists('events-stats-file-download-2018-01-02')
    assert search.index('events-stats-file-download-2018-01-03').count() == 1
    assert search.index('events-stats-file-download-2018-01-04').count() == 1
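The failing preprocessor above keeps its call counter in a function attribute. The same idea can be packaged as a small factory, shown here as a generic sketch that is not part of invenio-stats; wrapping it in a factory keeps the counter local to each test instead of living on a module-level helper.

def raises_on_call(n, message='mocked-exception'):
    """Illustrative factory: build a preprocessor that fails on its n-th call."""
    def _preprocessor(doc):
        _preprocessor.calls += 1
        if _preprocessor.calls == n:
            raise Exception(message)
        return doc
    _preprocessor.calls = 0
    return _preprocessor


# Usage (hypothetical): fail on the second processed event.
# indexer = EventsIndexer(queue, preprocessors=[raises_on_call(2)])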
Example #9
def test_overwriting_aggregations(app, es, event_queues, sequential_ids):
    """Check that the StatAggregator correctly starts from the bookmark.

    1. Create a sample file-download event and process it.
    2. Run the aggregator and write the count to the aggregation index.
    3. Create new events and repeat the procedure, then assert that the
       results within the interval of the previous events overwrite the
       aggregation by checking that the document version has increased.
    """
    for t in current_search.put_templates(ignore=[400]):
        pass

    class NewDate(datetime.datetime):
        """datetime.datetime mock."""
        # Aggregate at 12:00, thus the day will be aggregated again later
        current_date = (2017, 6, 2, 12)

        @classmethod
        def utcnow(cls):
            return cls(*cls.current_date)

    # Send some events
    event_type = 'file-download'
    events = [_create_file_download_event(date) for date in
              [(2017, 6, 1), (2017, 6, 2, 10)]]
    current_queues.declare()
    current_stats.publish(event_type, events)
    process_events(['file-download'])
    current_search_client.indices.flush(index='*')
    with patch('datetime.datetime', NewDate):
        aggregate_events(['file-download-agg'])

    # Send new events, some on the last aggregated day and some far
    # in the future.
    res = current_search_client.search(index='stats-file-download',
                                       version=True)
    for hit in res['hits']['hits']:
        if 'file_id' in hit['_source'].keys():
            assert hit['_version'] == 1

    new_events = [_create_file_download_event(date) for date in
                  [(2017, 6, 2, 15),  # second event on the same date
                   (2017, 7, 1)]]
    current_stats.publish(event_type, new_events)
    process_events(['file-download'])
    current_search_client.indices.flush(index='*')

    # Aggregate again. The aggregation should start from the last bookmark.
    NewDate.current_date = (2017, 7, 2)
    with patch('datetime.datetime', NewDate):
        aggregate_events(['file-download-agg'])
    current_search_client.indices.flush(index='*')

    res = current_search_client.search(
        index='stats-file-download',
        doc_type='file-download-day-aggregation',
        version=True
    )
    for hit in res['hits']['hits']:
        if hit['_source']['timestamp'] == '2017-06-02T00:00:00':
            assert hit['_version'] == 2
            assert hit['_source']['count'] == 2
        else:
            assert hit['_version'] == 1
Example #10
def test_overwriting_aggregations(app, es, event_queues, sequential_ids):
    """Check that the StatAggregator correctly starts from bookmark.

    1. Create sample file download event and process it.
    2. Run aggregator and write count, in aggregation index.
    3. Create new events and repeat procedure to assert that the
        results within the interval of the previous events
        overwrite the aggregation,
        by checking that the document version has increased.
    """
    for t in current_search.put_templates(ignore=[400]):
        pass

    class NewDate(datetime.datetime):
        """datetime.datetime mock."""
        # Aggregate at 12:00, thus the day will be aggregated again later
        current_date = (2017, 6, 2, 12)

        @classmethod
        def utcnow(cls):
            return cls(*cls.current_date)

    # Send some events
    event_type = 'file-download'
    events = [
        _create_file_download_event(date)
        for date in [(2017, 6, 1), (2017, 6, 2, 10)]
    ]
    current_queues.declare()
    current_stats.publish(event_type, events)
    process_events(['file-download'])
    current_search_client.indices.flush(index='*')
    with patch('datetime.datetime', NewDate):
        aggregate_events(['file-download-agg'])

    # Send new events, some on the last aggregated day and some far
    # in the future.
    res = current_search_client.search(index='stats-file-download',
                                       version=True)
    for hit in res['hits']['hits']:
        if 'file_id' in hit['_source'].keys():
            assert hit['_version'] == 1

    new_events = [
        _create_file_download_event(date) for date in [
            (2017, 6, 2, 15),  # second event on the same date
            (2017, 7, 1)
        ]
    ]
    current_stats.publish(event_type, new_events)
    process_events(['file-download'])
    current_search_client.indices.flush(index='*')

    # Aggregate again. The aggregation should start from the last bookmark.
    NewDate.current_date = (2017, 7, 2)
    with patch('datetime.datetime', NewDate):
        aggregate_events(['file-download-agg'])
    current_search_client.indices.flush(index='*')

    res = current_search_client.search(
        index='stats-file-download',
        doc_type='file-download-day-aggregation',
        version=True)
    for hit in res['hits']['hits']:
        if hit['_source']['timestamp'] == '2017-06-02T00:00:00':
            assert hit['_version'] == 2
            assert hit['_source']['count'] == 2
        else:
            assert hit['_version'] == 1
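A note on the `NewDate` trick used in the last two examples: `datetime.datetime` is implemented in C, so its `utcnow` cannot be monkey-patched directly; replacing the whole class with a subclass, as the tests do, is a common workaround. A standalone sketch of the same pattern, independent of invenio-stats:

import datetime
from unittest.mock import patch


class FrozenDate(datetime.datetime):
    """Illustrative subclass whose utcnow() returns a fixed point in time."""
    current_date = (2017, 6, 2, 12)

    @classmethod
    def utcnow(cls):
        return cls(*cls.current_date)


with patch('datetime.datetime', FrozenDate):
    assert datetime.datetime.utcnow() == FrozenDate(2017, 6, 2, 12)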