Example #1
    def get_data_transfer():
        """Get file transfer volume in TB."""
        time_range = {'gte': current_metrics.metrics_start_date.isoformat()}

        search = Search(
            using=current_search_client,
            index=build_alias_name('stats-file-download-*')).filter(
                'range',
                timestamp=time_range,
            ).filter(
                'term',
                is_parent=False,
            )
        search.aggs.metric('download_volume', 'sum', field='volume')
        result = search[:0].execute().aggregations.to_dict()
        download_volume = result.get('download_volume', {}).get('value', 0)

        search = Search(using=current_search_client,
                        index=build_alias_name('records')).filter(
                            'range', created=time_range)
        search.aggs.metric('upload_volume', 'sum', field='size')
        result = search[:0].execute().aggregations.to_dict()
        upload_volume = result.get('upload_volume', {}).get('value', 0)

        return int(download_volume + upload_volume)
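Every snippet on this page resolves index names through build_alias_name from invenio_search.utils. As a rough mental model (a minimal sketch under the assumption that the helper only prepends the configured SEARCH_INDEX_PREFIX; the stand-in name prefixed_name is made up for illustration):

from flask import current_app

def prefixed_name(name, prefix=None):
    # Illustrative stand-in for build_alias_name: prepend the search prefix.
    if prefix is None:
        prefix = current_app.config.get('SEARCH_INDEX_PREFIX', '')
    return '{}{}'.format(prefix, name)

# With SEARCH_INDEX_PREFIX = 'zenodo-', 'records' resolves to 'zenodo-records'
# and the pattern 'stats-file-download-*' to 'zenodo-stats-file-download-*'.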
Example #2
def find_aliases_for_index(index_name, aliases):
    """Recursively find ``index_name`` in a nested alias tree.

    Returns the prefixed index name plus every enclosing alias, or
    ``None`` if the index is not found.
    """
    if isinstance(aliases, str):
        return None
    for key, values in aliases.items():
        if key == index_name:
            return [build_alias_name(key)]
        else:
            found_aliases = find_aliases_for_index(index_name, values)
            if isinstance(found_aliases, list):
                found_aliases.append(build_alias_name(key))
                return found_aliases
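A hedged usage sketch for the recursive helper above; the shape of the alias tree (a nested dict with mapping-file paths as string leaves) is an assumption made for illustration:

aliases_tree = {
    'records': {
        'records-record-v1.0.0': '/mappings/records/record-v1.0.0.json',
    },
}

# Matched index plus every enclosing alias, each run through build_alias_name,
# e.g. ['records-record-v1.0.0', 'records'] once prefixed.
find_aliases_for_index('records-record-v1.0.0', aliases_tree)

# Unknown indices fall through every branch and yield None.
find_aliases_for_index('unknown-index', aliases_tree)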
Example #3
def test_basic_stats(app, db, es, locations, event_queues, minimal_record):
    """Test basic statistics results."""
    search = Search(using=es)
    records = create_stats_fixtures(
        # (10 * 2) -> 20 records and (10 * 2 * 3) -> 60 files
        metadata=minimal_record,
        n_records=10,
        n_versions=2,
        n_files=3,
        event_data={'user_id': '1'},
        # 4 event timestamps
        start_date=datetime(2018, 1, 1, 13),
        end_date=datetime(2018, 1, 1, 15),
        interval=timedelta(minutes=30))

    # Events indices
    prefix = app.config['SEARCH_INDEX_PREFIX']

    # 2 versions * 10 records * 3 files * 4 events -> 240
    assert search.index(prefix + 'events-stats-file-download').count() == 240
    # 2 versions * 10 records * 4 events -> 80
    assert search.index(prefix + 'events-stats-record-view').count() == 80

    # Aggregations indices
    # (2 versions + 1 concept) * 10 records -> 30 documents per aggregation
    # type, plus 2 bookmarks each

    # 30 aggregation documents
    assert search.index(prefix + 'stats-file-download').count() == 30

    # 30 aggregation documents
    assert search.index(prefix + 'stats-record-view').count() == 30

    # 2 bookmarks + 2 bookmarks
    assert search.index(prefix + 'stats-bookmarks').count() == 4

    # Records index
    for _, record, _ in records:
        doc = current_search_client.get(
            index=build_alias_name('records'),
            id=str(record.id),
            params={'_source_includes': '_stats'},
        )
        assert doc['_source']['_stats'] == {
            # 4 view events
            'views': 4.0,
            'version_views': 8.0,
            # 4 view events over 2 different hours
            'unique_views': 2.0,
            'version_unique_views': 2.0,
            # 4 download events * 3 files
            'downloads': 12.0,
            'version_downloads': 24.0,
            # 4 download events * 3 files over 2 different hours
            'unique_downloads': 2.0,
            'version_unique_downloads': 2.0,
            # 4 download events * 3 files * 10 bytes
            'volume': 120.0,
            'version_volume': 240.0,
        }
Example #4
def test_large_stats(app, db, es, locations, event_queues, minimal_record):
    """Test a larger number of events, aggregations, and results."""
    search = Search(using=es)
    records = create_stats_fixtures(
        # (3 * 4) -> 12 records and (3 * 4 * 2) -> 24 files
        metadata=minimal_record,
        n_records=3,
        n_versions=4,
        n_files=2,
        event_data={'user_id': '1'},
        # (31 + 30) * 2 -> 122 event timestamps (61 days and 2 events/day)
        start_date=datetime(2018, 3, 1),
        end_date=datetime(2018, 5, 1),
        interval=timedelta(hours=12))

    # Events indices
    prefix = app.config['SEARCH_INDEX_PREFIX']

    # 4 versions * 3 records * 2 files * 122 events -> 2928
    assert search.index(prefix + 'events-stats-file-download').count() == 2928
    # 4 versions * 3 records * 122 events -> 1464
    assert search.index(prefix + 'events-stats-record-view').count() == 1464

    # Aggregations indices
    # (4 versions + 1 concept) * 3 records -> 15 documents + 2 bookmarks
    q = search.index(prefix + 'stats-file-download')
    q = q.doc_type('file-download-day-aggregation')
    assert q.count() == 915  # 61 days * 15 records
    q = search.index(prefix + 'stats-record-view')
    q = q.doc_type('record-view-day-aggregation')
    assert q.count() == 915  # 61 days * 15 records

    # Records index
    for _, record, _ in records:
        doc = current_search_client.get(
            index=build_alias_name('records'),
            id=str(record.id),
            params={'_source_includes': '_stats'},
        )
        assert doc['_source']['_stats'] == {
            # 122 view events
            'views': 122.0,
            'version_views': 488.0,
            # 122 view events, each in a different hour
            'unique_views': 122.0,
            'version_unique_views': 122.0,
            # 122 download events * 2 files
            'downloads': 244.0,
            'version_downloads': 976.0,
            # 122 download events * 2 files, each in a different hour
            'unique_downloads': 122.0,
            'version_unique_downloads': 122.0,
            # 122 download events * 2 files * 10 bytes
            'volume': 2440.0,
            'version_volume': 9760.0,
        }
Example #5
    def __init__(self, name, index, client=None, *args, **kwargs):
        """Constructor.

        :param index: queried index.
        :param client: elasticsearch client used to query.
        """
        self.name = name
        self.index = build_alias_name(index)
        self.client = client or current_search_client
Example #6
def wait_es_refresh(index):
    """Block code execution until the ES index is refreshed.

    Useful when searching an index right after a record has been indexed.
    WARNING: this blocks code execution (including HTTP requests, when in a
    request context); by default the refresh should take at most 1 second.
    """
    prefixed_index = build_alias_name(index)
    current_search_client.indices.refresh(index=prefixed_index)
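Typical usage pattern (the RecordIndexer call and the record variable are assumptions for illustration): index a document, force a refresh, then search without waiting for Elasticsearch's near-real-time delay:

from invenio_indexer.api import RecordIndexer

# `record` is assumed to be an already-created Invenio record instance.
RecordIndexer().index(record)
wait_es_refresh('records')  # blocks until the prefixed 'records' index is refreshed
# A search against the 'records' alias will now see the freshly indexed document.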
Example #7
    def __init__(self,
                 index,
                 document_id=None,
                 client=None,
                 force=False,
                 initial_state=None):
        """Synchronization job state in Elasticsearch."""
        self.index = build_alias_name(index)
        self.document_id = document_id or 'state'
        self.doc_type = '_doc'
        self.force = force
        self.client = client or current_search_client
        self._state = {}
Example #8
    def get_visitors():
        """Get number of unique zenodo users."""
        time_range = {'gte': current_metrics.metrics_start_date.isoformat()}

        search = Search(using=current_search_client,
                        index=build_alias_name('events-stats-*')).filter(
                            'range', timestamp=time_range)

        search.aggs.metric('visitors_count', 'cardinality', field='visitor_id')
        result = search[:0].execute()

        if 'visitors_count' not in result.aggregations:
            return 0

        return int(result.aggregations.visitors_count.value)
Example #9
    def exists(self, index=None, **kwargs):
        """Check if an index exists.

        :param index: the index instance or index name to check. If not
                      given, the indexer record class index will be used.
        """
        if not index:
            index_name = self.record_cls.index._name
        elif isinstance(index, Index):
            index_name = index._name
        else:
            index_name = index

        index_name = build_alias_name(index_name)

        return self.client.indices.exists(index=index_name, **kwargs)
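A short usage sketch (the indexer instance and the Index import are assumptions made for illustration; the method itself only checks the prefixed name):

from elasticsearch_dsl import Index

indexer.exists()                      # falls back to the record class's index
indexer.exists('records')             # a plain name is prefixed before the check
indexer.exists(Index('records-record-v1.0.0'))  # an Index instance also works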
Example #10
def get_src(name, prefix):
    """Resolve the concrete source index behind an alias or index name."""
    index_name = None
    src_alias_name = build_alias_name(name, prefix=prefix)
    if old_client.indices.exists(src_alias_name):
        index_name = src_alias_name
    elif old_client.indices.exists_alias(src_alias_name):
        indexes = list(
            old_client.indices.get_alias(name=src_alias_name).keys())
        if len(indexes) > 1:
            raise Exception(
                'Multiple indexes found for alias {}.'.format(
                    src_alias_name))
        index_name = indexes[0]
    else:
        raise Exception(
            "alias or index ({}) doesn't exist".format(src_alias_name))
    return dict(index=index_name)
Example #11
def reindex_pid(pid_type, RecordClass, only=None, raise_on_error=False):
    index_name = None
    indexer = RecordIndexer()
    for pid in tqdm.tqdm(PersistentIdentifier.query.filter_by(
            pid_type=pid_type, object_type='rec',
            status=PIDStatus.REGISTERED.value)):
        record = RecordClass.get_record(pid.object_uuid)
        if only and str(record.id) != only:
            continue
        try:
            index_name, doc_type = indexer.record_to_index(record)
            index_name = build_alias_name(index_name)
            # print('Indexing', record.get('id'), 'into', index_name)
            indexer.index(record)
        except Exception:
            with open('/tmp/indexing-error.json', 'a') as f:
                print(json.dumps(record.dumps(), indent=4,
                                 ensure_ascii=False), file=f)
                traceback.print_exc(file=f)
            if raise_on_error:
                raise
    if index_name:
        current_search_client.indices.refresh(index_name)
        current_search_client.indices.flush(index_name)
Example #12
def reindex_pid(pid_type,
                RecordClass,
                only: str = None,
                raise_on_error: bool = False):
    index_name = None
    indexer = RecordIndexer()
    pids = PersistentIdentifier.query.filter_by(
        pid_type=pid_type,
        object_type='rec',
        status=PIDStatus.REGISTERED.value).all()
    for pid in tqdm(pids):
        try:
            record = RecordClass.get_record(pid.object_uuid)
        except NoResultFound:
            continue
        # Drop a known placeholder value left over from draft mode.
        keywords = record.get("keywords")
        if keywords == "Keywords must be fixed in draft mode":
            del record["keywords"]
        if only and str(record.id) != only:
            continue
        try:
            index_name, doc_type = indexer.record_to_index(record)
            index_name = build_alias_name(index_name)
            # print('Indexing', record.get('id'), 'into', index_name)
            indexer.index(record)
        except Exception:
            with open('/tmp/indexing-error.json', 'a') as f:
                print(json.dumps(record.dumps(), indent=4, ensure_ascii=False),
                      file=f)
                traceback.print_exc(file=f)
            if raise_on_error:
                raise
    if index_name:
        current_search_client.indices.refresh(index_name)
        current_search_client.indices.flush(index_name)
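A hedged invocation sketch (the 'recid' pid type and the Record class are assumptions for illustration; pass only=<record UUID as a string> to limit reindexing to a single record):

from invenio_records.api import Record

reindex_pid('recid', Record)                       # reindex every registered record
reindex_pid('recid', Record, raise_on_error=True)  # stop on the first indexing failure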