def get_data_transfer():
    """Get file transfer volume in TB."""
    time_range = {'gte': current_metrics.metrics_start_date.isoformat()}

    search = Search(
        using=current_search_client,
        index=build_alias_name('stats-file-download-*')
    ).filter(
        'range', timestamp=time_range,
    ).filter(
        'term', is_parent=False,
    )
    search.aggs.metric('download_volume', 'sum', field='volume')
    result = search[:0].execute().aggregations.to_dict()
    download_volume = result.get('download_volume', {}).get('value', 0)

    search = Search(
        using=current_search_client,
        index=build_alias_name('records')
    ).filter('range', created=time_range)
    search.aggs.metric('upload_volume', 'sum', field='size')
    result = search[:0].execute().aggregations.to_dict()
    upload_volume = result.get('upload_volume', {}).get('value', 0)

    return int(download_volume + upload_volume)
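# A minimal sketch (not from the source) of the zero-hits aggregation
# pattern get_data_transfer() relies on: slicing with [:0] sets size=0,
# so Elasticsearch returns only the aggregation, never the matching hits.
# The 'my-downloads' index and 'volume' field below are placeholders.
from elasticsearch_dsl import Search
from invenio_search import current_search_client


def sum_field(index, field, since):
    """Sum ``field`` over all documents in ``index`` newer than ``since``."""
    search = Search(using=current_search_client, index=index).filter(
        'range', timestamp={'gte': since})
    search.aggs.metric('total', 'sum', field=field)
    # size=0: only the aggregation result comes back.
    result = search[:0].execute().aggregations.to_dict()
    return result.get('total', {}).get('value', 0)


# e.g. sum_field('my-downloads', 'volume', '2018-01-01T00:00:00')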
def find_aliases_for_index(index_name, aliases):
    """Recursively find the alias chain for ``index_name`` in a nested alias tree."""
    if isinstance(aliases, str):
        # Leaf value (a concrete index name): no aliases below this point.
        return None
    for key, values in aliases.items():
        if key == index_name:
            return [build_alias_name(key)]
        else:
            found_aliases = find_aliases_for_index(index_name, values)
            if isinstance(found_aliases, list):
                # The index was found deeper in this branch: append the
                # current alias so ancestors accumulate, innermost first.
                found_aliases.append(build_alias_name(key))
                return found_aliases
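# Worked example (hypothetical data): invenio-search represents aliases as
# a nested dict whose leaves are concrete index names (str). Looking up a
# leaf returns its own alias plus every ancestor alias, innermost first.
aliases = {
    'records': {
        'records-record': {
            'records-record-v1.0.0': 'records-record-v1.0.0',
        },
    },
}
find_aliases_for_index('records-record-v1.0.0', aliases)
# -> ['records-record-v1.0.0', 'records-record', 'records']
#    (each entry passed through build_alias_name, i.e. prefixed)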
def test_basic_stats(app, db, es, locations, event_queues, minimal_record):
    """Test basic statistics results."""
    search = Search(using=es)
    records = create_stats_fixtures(
        # (10 * 2) -> 20 records and (10 * 2 * 3) -> 60 files
        metadata=minimal_record, n_records=10, n_versions=2, n_files=3,
        event_data={'user_id': '1'},
        # 4 event timestamps
        start_date=datetime(2018, 1, 1, 13),
        end_date=datetime(2018, 1, 1, 15),
        interval=timedelta(minutes=30))

    # Events indices
    prefix = app.config['SEARCH_INDEX_PREFIX']

    # 2 versions * 10 records * 3 files * 4 events -> 240
    assert search.index(prefix + 'events-stats-file-download').count() == 240

    # 2 versions * 10 records * 4 events -> 80
    assert search.index(prefix + 'events-stats-record-view').count() == 80

    # Aggregations indices
    # (2 versions + 1 concept) * 10 records -> 30 documents + 2 bookmarks

    # 30d
    assert search.index(prefix + 'stats-file-download').count() == 30
    # 30d
    assert search.index(prefix + 'stats-record-view').count() == 30
    # 2bm + 2bm
    assert search.index(prefix + 'stats-bookmarks').count() == 4

    # Records index
    for _, record, _ in records:
        doc = current_search_client.get(
            index=build_alias_name('records'),
            id=str(record.id),
            params={'_source_includes': '_stats'}
        )
        assert doc['_source']['_stats'] == {
            # 4 view events
            'views': 4.0,
            'version_views': 8.0,
            # 4 view events over 2 different hours
            'unique_views': 2.0,
            'version_unique_views': 2.0,
            # 4 download events * 3 files
            'downloads': 12.0,
            'version_downloads': 24.0,
            # 4 download events * 3 files over 2 different hours
            'unique_downloads': 2.0,
            'version_unique_downloads': 2.0,
            # 4 download events * 3 files * 10 bytes
            'volume': 120.0,
            'version_volume': 240.0,
        }
def test_large_stats(app, db, es, locations, event_queues, minimal_record):
    """Test a larger number of events, aggregations, and results."""
    search = Search(using=es)
    records = create_stats_fixtures(
        # (3 * 4) -> 12 records and (3 * 4 * 2) -> 24 files
        metadata=minimal_record, n_records=3, n_versions=4, n_files=2,
        event_data={'user_id': '1'},
        # (31 + 30) * 2 -> 122 event timestamps (61 days and 2 events/day)
        start_date=datetime(2018, 3, 1),
        end_date=datetime(2018, 5, 1),
        interval=timedelta(hours=12))

    # Events indices
    prefix = app.config['SEARCH_INDEX_PREFIX']

    # 4 versions * 3 records * 2 files * 122 events -> 2928
    assert search.index(prefix + 'events-stats-file-download').count() == 2928

    # 4 versions * 3 records * 122 events -> 1464
    assert search.index(prefix + 'events-stats-record-view').count() == 1464

    # Aggregations indices
    # (4 versions + 1 concept) * 3 records -> 15 documents + 2 bookmarks
    q = search.index(prefix + 'stats-file-download')
    q = q.doc_type('file-download-day-aggregation')
    assert q.count() == 915  # 61 days * 15 records

    q = search.index(prefix + 'stats-record-view')
    q = q.doc_type('record-view-day-aggregation')
    assert q.count() == 915  # 61 days * 15 records

    # Records index
    for _, record, _ in records:
        doc = current_search_client.get(
            index=build_alias_name('records'),
            id=str(record.id),
            params={'_source_includes': '_stats'}
        )
        assert doc['_source']['_stats'] == {
            # 122 view events
            'views': 122.0,
            'version_views': 488.0,
            # 122 view events, each in a different hour
            'unique_views': 122.0,
            'version_unique_views': 122.0,
            # 122 download events * 2 files
            'downloads': 244.0,
            'version_downloads': 976.0,
            # 122 download events, each in a different hour
            'unique_downloads': 122.0,
            'version_unique_downloads': 122.0,
            # 122 download events * 2 files * 10 bytes
            'volume': 2440.0,
            'version_volume': 9760.0,
        }
def __init__(self, name, index, client=None, *args, **kwargs):
    """Constructor.

    :param name: name of the query.
    :param index: queried index.
    :param client: elasticsearch client used to query.
    """
    self.name = name
    self.index = build_alias_name(index)
    self.client = client or current_search_client
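# Usage sketch; the enclosing class name (`ESQuery` below) is an assumption,
# since the class statement is not part of the snippet above.
query = ESQuery(name='file-download-total', index='stats-file-download')
assert query.client is current_search_client  # default client was applied
# query.index is the prefixed alias, e.g. 'zenodo-stats-file-download'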
def wait_es_refresh(index):
    """Block code execution until the ES index is refreshed.

    Useful when searching an index right after a record has been indexed.

    WARNING: this blocks code execution (including HTTP requests when in a
    request context); by default, a refresh should take at most 1 second.
    """
    prefixed_index = build_alias_name(index)
    current_search_client.indices.refresh(index=prefixed_index)
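# Intended usage (illustrative): write to an index, then block on the
# refresh so a follow-up search can see the new document. The indexer call
# assumes an existing ``record`` instance.
from invenio_indexer.api import RecordIndexer

RecordIndexer().index(record)   # the write may not be searchable yet
wait_es_refresh('records')      # force a refresh of the prefixed index
# a search against the 'records' alias will now find the document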
def __init__(self, index, document_id=None, client=None, force=False,
             initial_state=None):
    """Synchronization job state in ElasticSearch."""
    self.index = build_alias_name(index)
    self.document_id = document_id or 'state'
    self.doc_type = '_doc'
    self.force = force
    self.client = client or current_search_client
    self._state = {}
def get_visitors():
    """Get the number of unique Zenodo visitors."""
    time_range = {'gte': current_metrics.metrics_start_date.isoformat()}

    search = Search(
        using=current_search_client,
        index=build_alias_name('events-stats-*')
    ).filter('range', timestamp=time_range)
    search.aggs.metric('visitors_count', 'cardinality', field='visitor_id')
    result = search[:0].execute()

    if 'visitors_count' not in result.aggregations:
        return 0
    return int(result.aggregations.visitors_count.value)
def exists(self, index=None, **kwargs):
    """Check if an index exists.

    :param index: the index (or index name) to check. If not given, the
        indexer record class index will be used.
    """
    if not index:
        index_name = self.record_cls.index._name
    elif isinstance(index, Index):
        index_name = index._name
    else:
        index_name = index
    index_name = build_alias_name(index_name)
    return self.client.indices.exists(index=index_name, **kwargs)
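# Hedged usage sketch: all three call forms below resolve to the same
# prefixed index name before hitting Elasticsearch. ``indexer`` stands for
# an instance of the class this method belongs to.
from elasticsearch_dsl import Index

indexer.exists()                         # falls back to record_cls.index
indexer.exists(index='records')          # plain index name
indexer.exists(index=Index('records'))   # elasticsearch_dsl Index object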
def get_src(name, prefix):
    """Resolve ``name`` (an index or an alias) to a concrete source index."""
    index_name = None
    src_alias_name = build_alias_name(name, prefix=prefix)
    if old_client.indices.exists(src_alias_name):
        index_name = src_alias_name
    elif old_client.indices.exists_alias(src_alias_name):
        indexes = list(
            old_client.indices.get_alias(name=src_alias_name).keys())
        if len(indexes) > 1:
            raise Exception(
                'Multiple indexes found for alias {}.'.format(
                    src_alias_name))
        index_name = indexes[0]
    else:
        raise Exception(
            "alias or index ({}) doesn't exist".format(src_alias_name))
    return dict(index=index_name)
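# Illustrative call (hypothetical names): when 'records' only exists as an
# alias over a single versioned index, get_src() resolves the alias to the
# concrete index so the migration can read from it directly.
get_src('records', prefix='zenodo-')
# -> {'index': 'zenodo-records-record-v1.0.0'}
# Raises if the alias covers several indexes, or if neither an index nor an
# alias exists under the prefixed name.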
def reindex_pid(pid_type, RecordClass, only=None, raise_on_error=False):
    index_name = None
    indexer = RecordIndexer()
    pids = PersistentIdentifier.query.filter_by(
        pid_type=pid_type, object_type='rec',
        status=PIDStatus.REGISTERED.value)
    for pid in tqdm.tqdm(pids):
        record = RecordClass.get_record(pid.object_uuid)
        if only and str(record.id) != only:
            continue
        try:
            index_name, doc_type = indexer.record_to_index(record)
            index_name = build_alias_name(index_name)
            # print('Indexing', record.get('id'), 'into', index_name)
            indexer.index(record)
        except Exception:
            # Log the failing record and traceback for later inspection.
            with open('/tmp/indexing-error.json', 'a') as f:
                print(json.dumps(record.dumps(), indent=4,
                                 ensure_ascii=False), file=f)
                traceback.print_exc(file=f)
            if raise_on_error:
                raise
    if index_name:
        current_search_client.indices.refresh(index_name)
        current_search_client.indices.flush(index_name)
def reindex_pid(pid_type, RecordClass, only: str = None,
                raise_on_error: bool = False):
    index_name = None
    indexer = RecordIndexer()
    pids = PersistentIdentifier.query.filter_by(
        pid_type=pid_type, object_type='rec',
        status=PIDStatus.REGISTERED.value).all()
    for pid in tqdm(pids):
        try:
            record = RecordClass.get_record(pid.object_uuid)
        except NoResultFound:
            continue

        # Drop the placeholder value left over from draft validation.
        if record.get("keywords") == "Keywords must be fixed in draft mode":
            del record["keywords"]

        if only and str(record.id) != only:
            continue
        try:
            index_name, doc_type = indexer.record_to_index(record)
            index_name = build_alias_name(index_name)
            # print('Indexing', record.get('id'), 'into', index_name)
            indexer.index(record)
        except Exception:
            # Log the failing record and traceback for later inspection.
            with open('/tmp/indexing-error.json', 'a') as f:
                print(json.dumps(record.dumps(), indent=4,
                                 ensure_ascii=False), file=f)
                traceback.print_exc(file=f)
            if raise_on_error:
                raise
    if index_name:
        current_search_client.indices.refresh(index_name)
        current_search_client.indices.flush(index_name)
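# Illustrative invocations (``Record`` stands in for the concrete record
# API class of the instance; any class exposing ``get_record`` works):
from invenio_records.api import Record

# Reindex every registered 'recid' PID, stopping at the first failure:
reindex_pid('recid', Record, raise_on_error=True)

# Reindex a single record, matched by its UUID string:
reindex_pid('recid', Record, only='<record-uuid>')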