Example #1
 def elasticsearch(self):
     """
     Indicates whether the Elasticsearch cluster is responsive.
     """
     return {
         'up': ESClientFactory.get().ping(),
     }
Example #2
 def add_cart_item(self, catalog: CatalogName, user_id, cart_id, entity_id,
                   entity_type, entity_version):
     """
     Add an item to a cart and return the created item ID
     An error will be raised if the cart does not exist or does not belong to the user
     """
     # TODO: Cart item should have some user readable name
     if cart_id is None:
         cart = self.get_or_create_default_cart(user_id)
     else:
         cart = self.get_cart(user_id, cart_id)
     real_cart_id = cart['CartId']
     if not entity_version:
         # When entity_version is not given, look the entity up to verify
         # that it exists and to retrieve its version.
         entity = ESClientFactory.get().get(
             index=config.es_index_name(catalog=catalog,
                                        entity_type=entity_type,
                                        aggregate=True),
             id=entity_id,
             _source=True,
             _source_include=[
                 'contents.files.uuid',  # data file UUID
                 'contents.files.version',  # data file version
                 'contents.projects.document_id',  # metadata file UUID
                 'contents.samples.document_id',  # metadata file UUID
             ])['_source']
         normalized_entity = self.extract_entity_info(entity_type, entity)
         entity_version = normalized_entity['version']
     new_item = self.transform_entity_to_cart_item(real_cart_id,
                                                   entity_type, entity_id,
                                                   entity_version)
     self.dynamo_accessor.insert_item(config.dynamo_cart_item_table_name,
                                      new_item)
     return new_item['CartItemId']
Example #3
    def __init__(self, catalog: Optional[CatalogName],
                 field_types: CataloguedFieldTypes, refresh: Union[bool, str],
                 conflict_retry_limit: int, error_retry_limit: int) -> None:
        """
        :param field_types: A mapping of field paths to field type

        :param refresh: https://www.elastic.co/guide/en/elasticsearch/reference/5.5/docs-refresh.html

        :param conflict_retry_limit: The maximum number of retries (the second
                                     attempt is the first retry) on version
                                     conflicts. Specify 0 for no retries or None
                                     for unlimited retries.

        :param error_retry_limit: The maximum number of retries (the second
                                  attempt is the first retry) on other errors.
                                  Specify 0 for no retries or None for
                                  unlimited retries.
        """
        super().__init__()
        self.catalog = catalog
        self.field_types = field_types
        self.refresh = refresh
        self.conflict_retry_limit = conflict_retry_limit
        self.error_retry_limit = error_retry_limit
        self.es_client = ESClientFactory.get()
        self.errors: MutableMapping[DocumentCoordinates,
                                    int] = defaultdict(int)
        self.conflicts: MutableMapping[DocumentCoordinates,
                                       int] = defaultdict(int)
        self.retries: Optional[MutableSet[DocumentCoordinates]] = None
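Example #3 shows only the initializer, so the enclosing class is not named in the snippet. A minimal usage sketch of the retry-limit semantics from the docstring, assuming a hypothetical IndexWriter class with this signature:

    # Hypothetical: the class name and argument values are illustrative.
    writer = IndexWriter(catalog='dcp2',
                         field_types=field_types,
                         refresh=False,
                         conflict_retry_limit=None,  # None means unlimited retries
                         error_retry_limit=2)  # at most two retries, i.e. three attempts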
Example #4
    def _read_aggregates(
        self, entities: CataloguedTallies
    ) -> Dict[CataloguedEntityReference, Aggregate]:
        coordinates = [
            AggregateCoordinates(entity=entity) for entity in entities
        ]
        request = {
            'docs': [{
                '_type': coordinate.type,
                '_index': coordinate.index_name,
                '_id': coordinate.document_id
            } for coordinate in coordinates]
        }
        catalogs = {coordinate.entity.catalog for coordinate in coordinates}
        mandatory_source_fields = set()
        for catalog in catalogs:
            aggregate_cls = self.aggregate_class(catalog)
            mandatory_source_fields.update(
                aggregate_cls.mandatory_source_fields())
        response = ESClientFactory.get().mget(
            body=request, _source_include=list(mandatory_source_fields))

        def aggregates():
            for doc in response['docs']:
                if doc['found']:
                    coordinate = DocumentCoordinates.from_hit(doc)
                    aggregate_cls = self.aggregate_class(
                        coordinate.entity.catalog)
                    aggregate = aggregate_cls.from_index(
                        self.catalogued_field_types(),
                        doc,
                        coordinates=coordinate)
                    yield aggregate

        return {a.coordinates.entity: a for a in aggregates()}
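In an mget response the docs array mirrors the request one-to-one, and documents that were not found come back with found set to False, which is why the aggregates() generator filters on doc['found']. An illustrative response shape (the index name, IDs and source are made up):

    response = {
        'docs': [
            {'_index': 'azul_v2_files_aggregate', '_type': 'doc',
             '_id': 'id-1', 'found': True, '_source': {'contents': {}}},
            {'_index': 'azul_v2_files_aggregate', '_type': 'doc',
             '_id': 'id-2', 'found': False},
        ]
    }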
Example #5
 def create_indices(self, catalog: CatalogName):
     es_client = ESClientFactory.get()
     for index_name in self.index_names(catalog):
         # Loop until the index exists and its settings and mappings
         # have been verified by _check_index()
         while True:
             settings = self.settings(index_name)
             mappings = self.metadata_plugin(catalog).mapping()
             try:
                 with silenced_es_logger():
                     index = es_client.indices.get(index=index_name)
             except NotFoundError:
                 try:
                     es_client.indices.create(index=index_name,
                                              body=dict(settings=settings,
                                                        mappings=mappings))
                 except RequestError as e:
                     if e.error == 'resource_already_exists_exception':
                         log.info(
                             'Another party concurrently created index %r, retrying.',
                             index_name)
                     else:
                         raise
             else:
                 self._check_index(settings=settings,
                                   mappings=mappings,
                                   index=index[index_name])
                 break
Example #6
 def create_indices(self, catalog: CatalogName):
     es_client = ESClientFactory.get()
     for index_name in self.index_names(catalog):
         es_client.indices.create(
             index=index_name,
             ignore=[400],
             body=dict(settings=self.settings(index_name),
                       mappings=dict(
                           doc=self.metadata_plugin(catalog).mapping())))
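Compared with Example #5, this variant makes creation idempotent by passing ignore=[400], so an HTTP 400 response (which covers resource_already_exists_exception) is returned instead of raised. The trade-off is that ignore=[400] also silences genuine request errors such as a malformed mapping, which is presumably why Example #5 inspects e.error and re-raises anything other than the concurrent-creation case.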
Example #7
 def _assert_indices_exist(self, catalog: CatalogName):
     """
     Aside from checking that all indices exist, this method also asserts
     that we can instantiate a local ES client pointing at a real, remote
     ES domain.
     """
     es_client = ESClientFactory.get()
     service = IndexService()
     for index_name in service.index_names(catalog):
         self.assertTrue(es_client.indices.exists(index_name))
Example #8
 def setUpClass(cls):
     super().setUpClass()
     es_endpoint = cls._create_container('docker.elastic.co/elasticsearch/elasticsearch:6.8.0',
                                         container_port=9200,
                                         environment=['xpack.security.enabled=false',
                                                      'discovery.type=single-node',
                                                      'ES_JAVA_OPTS=-Xms512m -Xmx512m'])
     try:
         new_env = config.es_endpoint_env(es_endpoint=es_endpoint, es_instance_count=2)
         cls._env_patch = mock.patch.dict(os.environ, **new_env)
         cls._env_patch.__enter__()
         cls.es_client = ESClientFactory.get()
         cls._wait_for_es()
     except BaseException:  # no coverage
         cls._kill_containers()
         raise
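The snippet relies on a _wait_for_es helper that is not shown. A minimal sketch of what such a readiness poll might look like, assuming the class-level es_client created above (the timeout and interval values are illustrative):

    import time

    @classmethod
    def _wait_for_es(cls, timeout: float = 60, interval: float = 1):
        # Poll until the containerized cluster answers pings, or give up
        deadline = time.time() + timeout
        while not cls.es_client.ping():
            if time.time() > deadline:
                raise RuntimeError('Elasticsearch did not become ready in time')
            time.sleep(interval)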
Example #9
 def deindex(self, catalog: CatalogName, sources: Iterable[str]):
     plugin = self.repository_plugin(catalog)
     source_ids = [plugin.resolve_source(s).id for s in sources]
     es_client = ESClientFactory.get()
     indices = ','.join(self.index_service.index_names(catalog))
     query = {
         'query': {
             'bool': {
                 'should': [
                     {
                         'terms': {
                             # Aggregate documents
                             'sources.id.keyword': source_ids
                         }
                     },
                     {
                         'terms': {
                             # Contribution documents
                             'source.id.keyword': source_ids
                         }
                     }
                 ]
             }
         }
     }
     logger.info('Deindexing sources %r from catalog %r', sources, catalog)
     logger.debug('Using query: %r', query)
     response = es_client.delete_by_query(index=indices,
                                          body=query,
                                          slices='auto')
     if len(response['failures']) > 0:
         if response['version_conflicts'] > 0:
             logger.error(
                 'Version conflicts encountered. Do not deindex while '
                 'indexing is occurring. The index may now be in an '
                 'inconsistent state.')
         raise RuntimeError('Failures during deletion',
                            response['failures'])
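If deindexing had to tolerate concurrent writes, delete_by_query could instead be told to skip conflicting documents rather than report them as failures, at the cost of leaving those documents in place; a sketch of that variant:

    response = es_client.delete_by_query(index=indices,
                                         body=query,
                                         slices='auto',
                                         conflicts='proceed')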
Example #10
 def setUpClass(cls):
     super().setUpClass()
     cls.es_client = ESClientFactory.get()
Example #11
 def es_client(self) -> Elasticsearch:
     return ESClientFactory.get()
Example #12
 def _read_contributions(
         self, tallies: CataloguedTallies) -> List[CataloguedContribution]:
     es_client = ESClientFactory.get()
     entity_ids_by_index: MutableMapping[str,
                                         MutableSet[str]] = defaultdict(set)
     for entity in tallies.keys():
         index = config.es_index_name(catalog=entity.catalog,
                                      entity_type=entity.entity_type,
                                      aggregate=False)
         entity_ids_by_index[index].add(entity.entity_id)
     query = {
         "query": {
             "bool": {
                 "should": [{
                     "bool": {
                         "must": [{
                             "term": {
                                 "_index": index
                             }
                         }, {
                             "terms": {
                                 "entity_id.keyword": list(entity_ids)
                             }
                         }]
                     }
                 } for index, entity_ids in entity_ids_by_index.items()]
             }
         }
     }
     index = sorted(entity_ids_by_index.keys())
     # scan() uses a server-side cursor and is expensive, so only use it
     # when the number of contributions is large.
     page_size = 1000  # a page size of 100 caused excessive ScanError occurrences
     num_contributions = sum(tallies.values())
     hits = None
     if num_contributions <= page_size:
         log.info('Reading %i expected contribution(s) using search().',
                  num_contributions)
         response = es_client.search(index=index,
                                     body=query,
                                     size=page_size,
                                     doc_type=Document.type)
         total_hits = response['hits']['total']
         if total_hits <= page_size:
             hits = response['hits']['hits']
             if len(hits) != total_hits:
                 message = f'Search returned {len(hits)} hits but reports total to be {total_hits}'
                 raise EventualConsistencyException(message)
         else:
             log.info('Expected only %i contribution(s) but got %i.',
                      num_contributions, total_hits)
             num_contributions = total_hits
     if hits is None:
         log.info('Reading %i expected contribution(s) using scan().',
                  num_contributions)
         hits = scan(es_client,
                     index=index,
                     query=query,
                     size=page_size,
                     doc_type=Document.type)
     contributions = [
         Contribution.from_index(self.catalogued_field_types(), hit)
         for hit in hits
     ]
      log.info('Read %i contribution(s).', len(contributions))
     if log.isEnabledFor(logging.DEBUG):
         entity_ref = attrgetter('entity')
         log.debug(
             'Number of contributions read, by entity: %r', {
                 f'{entity.entity_type}/{entity.entity_id}': sum(
                     1 for _ in contribution_group)
                 for entity, contribution_group in groupby(
                     sorted(contributions, key=entity_ref), key=entity_ref)
             })
     return contributions
Example #13
 def delete_indices(self, catalog: CatalogName):
     es_client = ESClientFactory.get()
     for index_name in self.index_names(catalog):
         if es_client.indices.exists(index_name):
             es_client.indices.delete(index=index_name)
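Note that the exists-then-delete sequence is racy if another party deletes the index between the two calls. With elasticsearch-py clients that accept the ignore parameter, a single call that tolerates a missing index avoids that window:

    es_client.indices.delete(index=index_name, ignore=[404])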
Example #14
    def _read_contributions(
            self, tallies: CataloguedTallies) -> List[CataloguedContribution]:
        es_client = ESClientFactory.get()

        entity_ids_by_index: MutableMapping[str,
                                            MutableSet[str]] = defaultdict(set)
        for entity in tallies.keys():
            index = config.es_index_name(catalog=entity.catalog,
                                         entity_type=entity.entity_type,
                                         aggregate=False)
            entity_ids_by_index[index].add(entity.entity_id)

        query = {
            'bool': {
                'should': [{
                    'bool': {
                        'must': [{
                            'term': {
                                '_index': index
                            }
                        }, {
                            'terms': {
                                'entity_id.keyword': list(entity_ids)
                            }
                        }]
                    }
                } for index, entity_ids in entity_ids_by_index.items()]
            }
        }

        index = sorted(entity_ids_by_index.keys())
        num_contributions = sum(tallies.values())
        log.info('Reading %i expected contribution(s)', num_contributions)

        def pages() -> Iterable[JSONs]:
            body = dict(query=query)
            while True:
                response = es_client.search(
                    index=index,
                    sort=['_index', 'document_id.keyword'],
                    body=body,
                    size=config.contribution_page_size,
                    track_total_hits=False,
                    seq_no_primary_term=Contribution.needs_seq_no_primary_term)
                hits = response['hits']['hits']
                log.debug('Read a page with %i contribution(s)', len(hits))
                if hits:
                    yield hits
                    body['search_after'] = hits[-1]['sort']
                else:
                    break

        contributions = [
            Contribution.from_index(self.catalogued_field_types(), hit)
            for hits in pages() for hit in hits
        ]

        log.info('Read %i contribution(s)', len(contributions))
        if log.isEnabledFor(logging.DEBUG):
            entity_ref = attrgetter('entity')
            log.debug(
                'Number of contributions read, by entity: %r', {
                    f'{entity.entity_type}/{entity.entity_id}': sum(
                        1 for _ in contribution_group)
                    for entity, contribution_group in groupby(
                        sorted(contributions, key=entity_ref), key=entity_ref)
                })
        return contributions
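Example #14 replaces the search()/scan() logic of Example #12 with search_after pagination: each request sorts on a deterministic, unique key and resumes from the sort values of the last hit of the previous page. The core of that pattern, distilled into a standalone sketch (the client, index, query and page size are assumed to be supplied by the caller):

    def iter_hits(es_client, index, query, page_size=1000):
        # Deep pagination with search_after: sort deterministically, then
        # resume each request from the previous page's last sort values
        body = {'query': query}
        while True:
            response = es_client.search(index=index,
                                        sort=['_index', 'document_id.keyword'],
                                        body=body,
                                        size=page_size,
                                        track_total_hits=False)
            hits = response['hits']['hits']
            if not hits:
                break
            yield from hits
            body['search_after'] = hits[-1]['sort']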