Example #1
0
 def add_cart_item(self, catalog: CatalogName, user_id, cart_id, entity_id,
                   entity_type, entity_version):
     """
     Add an item to a cart and return the created item ID.

     If `cart_id` is None, the user's default cart is used (and created if
     necessary). An error will be raised if the cart does not exist or does
     not belong to the user.

     :param catalog: the catalog whose aggregate ES index is consulted when
                     the entity version has to be looked up
     :param user_id: the owner of the cart
     :param cart_id: the cart to add the item to, or None for the user's
                     default cart
     :param entity_id: the UUID of the entity the item refers to
     :param entity_type: the type of that entity
     :param entity_version: the version of that entity; if falsy, the
                            version is resolved from the aggregate ES index
     :return: the ID of the newly created cart item
     """
     # TODO: Cart item should have some user readable name
     if cart_id is None:
         cart = self.get_or_create_default_cart(user_id)
     else:
         cart = self.get_cart(user_id, cart_id)
     real_cart_id = cart['CartId']
     if not entity_version:
         # When entity_version is not given, this method will check the data
         # integrity (the get() fails if the entity does not exist) and
         # retrieve the version from the indexed document.
         entity = ESClientFactory.get().get(
             index=config.es_index_name(catalog=catalog,
                                        entity_type=entity_type,
                                        aggregate=True),
             id=entity_id,
             _source=True,
             _source_include=[
                 'contents.files.uuid',  # data file UUID
                 'contents.files.version',  # data file version
                 'contents.projects.document_id',  # metadata file UUID
                 'contents.samples.document_id',  # metadata file UUID
             ])['_source']
         normalized_entity = self.extract_entity_info(entity_type, entity)
         entity_version = normalized_entity['version']
     new_item = self.transform_entity_to_cart_item(real_cart_id,
                                                   entity_type, entity_id,
                                                   entity_version)
     self.dynamo_accessor.insert_item(config.dynamo_cart_item_table_name,
                                      new_item)
     return new_item['CartItemId']
Example #2
0
 def index_names(self, catalog: CatalogName) -> List[str]:
     """
     The names of all Elasticsearch indices for the given catalog: one
     per entity type, in both the contribution (non-aggregate) and the
     aggregate variant.
     """
     names = []
     for entity_type in self.entity_types(catalog):
         for aggregate in (False, True):
             names.append(config.es_index_name(catalog=catalog,
                                               entity_type=entity_type,
                                               aggregate=aggregate))
     return names
    def _create_request(self,
                        catalog: CatalogName,
                        filters: FiltersJSON,
                        post_filter: bool = False,
                        source_filter: SourceFilters = None,
                        enable_aggregation: bool = True,
                        entity_type='files') -> Search:
        """
        Create an ElasticSearch request against the aggregate index for the
        given entity type, based on the given filters and the catalog's
        facet configuration.

        :param catalog: The name of the catalog to create the request for.
        :param filters: The 'filters' parameter. Assumed to be translated
               into es_key terms.
        :param post_filter: If True, apply the filters as a post_filter
               (faceting); otherwise apply them as a regular query.
        :param source_filter: A list of "foo.bar" field paths (see
               https://www.elastic.co/guide/en/elasticsearch/reference/5.5/search-request-source-filtering.html)
        :param enable_aggregation: If True, add one aggregation bucket per
               configured facet.
        :param entity_type: the string referring to the entity type used to
               get the ElasticSearch index to search.
        :return: Returns the Search object that can be used for executing
               the request.
        """
        service_config = self.service_config(catalog)
        mapping = service_config.translation
        facet_config = {facet: mapping[facet]
                        for facet in service_config.facets}
        index = config.es_index_name(catalog=catalog,
                                     entity_type=entity_type,
                                     aggregate=True)
        request = Search(using=self.es_client, index=index)
        translated = self._translate_filters(catalog, filters, mapping)

        es_query = self._create_query(catalog, translated)
        request = request.post_filter(es_query) if post_filter else request.query(es_query)

        if source_filter:
            request = request.source(includes=source_filter)
        elif entity_type not in ('files', 'bundles'):
            # Keep responses for non-file, non-bundle entities lean
            request = request.source(excludes='bundles')

        if enable_aggregation:
            for facet in facet_config:
                # FIXME: Aggregation filters may be redundant when post_filter is false
                #        https://github.com/DataBiosphere/azul/issues/3435
                request.aggs.bucket(
                    facet,
                    self._create_aggregate(catalog, translated, facet_config,
                                           facet))

        return request
Example #4
0
 def index_name(self) -> str:
     """
     The fully qualified name of the Elasticsearch index for a document
     with these coordinates. Only call this if these coordinates use a
     catalogued entity reference. You can use `.with_catalog()` to create
     one.
     """
     entity = self.entity
     assert isinstance(entity, CataloguedEntityReference)
     return config.es_index_name(catalog=entity.catalog,
                                 entity_type=entity.entity_type,
                                 aggregate=self.aggregate)
Example #5
0
 def _load_canned_result(self, bundle_fqid: BundleFQID) -> MutableJSONs:
     """
     Load the canned index documents for the given canned bundle, rewriting
     the '_index' entry of each hit to the corresponding index name in the
     current deployment.
     """
     hits = self._load_canned_file(bundle_fqid, 'results')
     assert isinstance(hits, list)
     for hit in hits:
         parsed = IndexName.parse(hit['_index'])
         hit['_index'] = config.es_index_name(catalog=self.catalog,
                                              entity_type=parsed.entity_type,
                                              aggregate=parsed.aggregate)
     return hits
    def _create_autocomplete_request(self,
                                     catalog: CatalogName,
                                     filters: FiltersJSON,
                                     es_client,
                                     _query,
                                     search_field,
                                     entity_type='files'):
        """
        Create an ElasticSearch request for autocomplete (prefix) queries
        against the aggregate index for the given entity type.

        :param catalog: The name of the catalog to create the ES request for.

        :param filters: The 'filters' parameter from '/keywords'.

        :param es_client: The ElasticSearch client object used to configure
                          the Search object.

        :param _query: The query (string) to use for querying.

        :param search_field: The field to do the query on.

        :param entity_type: the string referring to the entity type used to
                            get the ElasticSearch index to search.

        :return: Returns the Search object that can be used for executing
                 the request.
        """
        service_config = self.service_config(catalog)
        mapping = service_config.autocomplete_translation[entity_type]
        index = config.es_index_name(catalog=catalog,
                                     entity_type=entity_type,
                                     aggregate=True)
        request = Search(using=es_client, index=index)
        translated = self._translate_filters(catalog, filters, mapping)
        # Translate the search field if a translation is configured for it
        if search_field in mapping:
            field = mapping[search_field]
        else:
            field = search_field
        filter_query = self._create_query(catalog, translated)
        request = request.post_filter(filter_query)
        request = request.query(Q('prefix', **{str(field): _query}))
        return request
Example #7
0
 def index_name(entity_type):
     # Resolve the deployment-specific ES index name for the given entity
     # type. NOTE(review): `self` and `aggregate` are free variables from
     # the enclosing scope (not visible here) — confirm they are bound
     # before this helper is called.
     return config.es_index_name(catalog=self.catalog,
                                 entity_type=entity_type,
                                 aggregate=aggregate)
Example #8
0
    def transform_request(self,
                          catalog: CatalogName,
                          entity_type: str,
                          filters: Filters,
                          pagination: Optional[Pagination] = None) -> MutableJSON:
        """
        Perform the whole transformation process: build an ES request from
        the given filters, execute it, and translate the ES response into
        the API response format.

        :param catalog: The name of the catalog to query
        :param entity_type: the string referring to the entity type used to
        get the ElasticSearch index to search
        :param filters: Filter parameter from the API to be used in the
        query
        :param pagination: Pagination to be used for the API. If None, a
        single keyword search is performed and pagination is excluded from
        the output
        :return: Returns the transformed request
        """
        service_config = self.service_config(catalog)
        translation = service_config.translation
        inverse_translation = {v: k for k, v in translation.items()}

        # Reject filter and sort fields that are not configured facets
        for facet in filters.keys():
            if facet not in translation:
                raise BadArgumentException(f"Unable to filter by undefined facet {facet}.")

        if pagination is not None:
            facet = pagination["sort"]
            if facet not in translation:
                raise BadArgumentException(f"Unable to sort by undefined facet {facet}.")

        es_search = self._create_request(catalog=catalog,
                                         filters=filters,
                                         post_filter=True,
                                         entity_type=entity_type)

        if pagination is None:
            # It's a single file search
            self._annotate_aggs_for_translation(es_search)
            es_response = es_search.execute(ignore_cache=True)
            self._translate_response_aggs(catalog, es_response)
            es_response_dict = es_response.to_dict()
            hits = [hit['_source'] for hit in es_response_dict['hits']['hits']]
            hits = self.translate_fields(catalog, hits, forward=False)
            final_response = KeywordSearchResponse(hits, entity_type, catalog)
        else:
            # It's a full file search
            # Translate the sort field if there is any translation available
            if pagination['sort'] in translation:
                pagination['sort'] = translation[pagination['sort']]
            es_search = self._apply_paging(catalog, es_search, pagination)
            self._annotate_aggs_for_translation(es_search)
            try:
                es_response = es_search.execute(ignore_cache=True)
            except elasticsearch.NotFoundError as e:
                raise IndexNotFoundError(e.info["error"]["index"])
            except elasticsearch.RequestError as e:
                # NOTE(review): presumably ES raises 'No mapping found' when
                # sorting on an index without documents; in that case return
                # an empty response instead of propagating the error.
                if one(e.info['error']['root_cause'])['reason'].startswith('No mapping found for'):
                    es_response = self.es_client.count(index=config.es_index_name(catalog=catalog,
                                                                                  entity_type=entity_type,
                                                                                  aggregate=True))
                    if es_response['count'] == 0:  # Count is zero for empty index
                        final_response = FileSearchResponse(hits={},
                                                            pagination={},
                                                            facets={},
                                                            entity_type=entity_type,
                                                            catalog=catalog)
                        return final_response.apiResponse.to_json()
                raise e
            self._translate_response_aggs(catalog, es_response)
            es_response_dict = es_response.to_dict()
            # Extract hits and facets (aggregations)
            es_hits = es_response_dict['hits']['hits']
            # If the number of elements exceed the page size, then we fetched one too many
            # entries to determine if there is a previous or next page.  In that case,
            # return one fewer hit.
            list_adjustment = 1 if len(es_hits) > pagination['size'] else 0
            if 'search_before' in pagination:
                # When paging backwards the hits come back in reverse order
                hits = reversed(es_hits[0:len(es_hits) - list_adjustment])
            else:
                hits = es_hits[0:len(es_hits) - list_adjustment]
            hits = [hit['_source'] for hit in hits]
            hits = self.translate_fields(catalog, hits, forward=False)

            facets = es_response_dict['aggregations'] if 'aggregations' in es_response_dict else {}
            # Report the sort field back to the client under its public name
            pagination['sort'] = inverse_translation[pagination['sort']]
            paging = self._generate_paging_dict(catalog, filters, es_response_dict, pagination)
            final_response = FileSearchResponse(hits, paging, facets, entity_type, catalog)

        final_response = final_response.apiResponse.to_json()

        return final_response
Example #9
0
 def _read_contributions(
         self, tallies: CataloguedTallies) -> List[CataloguedContribution]:
     """
     Read all contributions for the entities referenced by the given
     tallies from their contribution (non-aggregate) indices.
     """
     es_client = ESClientFactory.get()
     # Group the entity IDs to look up by the contribution index that holds
     # them (one index per catalog and entity type)
     entity_ids_by_index: MutableMapping[str,
                                         MutableSet[str]] = defaultdict(set)
     for entity in tallies.keys():
         index = config.es_index_name(catalog=entity.catalog,
                                      entity_type=entity.entity_type,
                                      aggregate=False)
         entity_ids_by_index[index].add(entity.entity_id)
     # Match each index only against the entity IDs expected in that index
     query = {
         "query": {
             "bool": {
                 "should": [{
                     "bool": {
                         "must": [{
                             "term": {
                                 "_index": index
                             }
                         }, {
                             "terms": {
                                 "entity_id.keyword": list(entity_ids)
                             }
                         }]
                     }
                 } for index, entity_ids in entity_ids_by_index.items()]
             }
         }
     }
     index = sorted(list(entity_ids_by_index.keys()))
     # scan() uses a server-side cursor and is expensive. Only use it if the number of contributions is large
     page_size = 1000  # page size of 100 caused excessive ScanError occurrences
     num_contributions = sum(tallies.values())
     hits = None
     if num_contributions <= page_size:
         log.info('Reading %i expected contribution(s) using search().',
                  num_contributions)
         response = es_client.search(index=index,
                                     body=query,
                                     size=page_size,
                                     doc_type=Document.type)
         total_hits = response['hits']['total']
         if total_hits <= page_size:
             hits = response['hits']['hits']
             if len(hits) != total_hits:
                 message = f'Search returned {len(hits)} hits but reports total to be {total_hits}'
                 raise EventualConsistencyException(message)
         else:
             # More contributions than expected; fall through to scan()
             log.info('Expected only %i contribution(s) but got %i.',
                      num_contributions, total_hits)
             num_contributions = total_hits
     if hits is None:
         log.info('Reading %i expected contribution(s) using scan().',
                  num_contributions)
         hits = scan(es_client,
                     index=index,
                     query=query,
                     size=page_size,
                     doc_type=Document.type)
     contributions = [
         Contribution.from_index(self.catalogued_field_types(), hit)
         for hit in hits
     ]
     log.info('Read %i contribution(s). ', len(contributions))
     if log.isEnabledFor(logging.DEBUG):
         entity_ref = attrgetter('entity')
         log.debug(
             'Number of contributions read, by entity: %r', {
                 f'{entity.entity_type}/{entity.entity_id}': sum(
                     1 for _ in contribution_group)
                 for entity, contribution_group in groupby(
                     sorted(contributions, key=entity_ref), key=entity_ref)
             })
     return contributions
Example #10
0
 def _index_name(self):
     """
     The name of the aggregate Elasticsearch index holding file documents
     for this catalog.
     """
     kwargs = dict(catalog=self.catalog, entity_type='files', aggregate=True)
     return config.es_index_name(**kwargs)
Example #11
0
    def _read_contributions(
            self, tallies: CataloguedTallies) -> List[CataloguedContribution]:
        """
        Read all contributions for the entities referenced by the given
        tallies from their contribution (non-aggregate) indices, paging
        through the results with `search_after`.
        """
        es_client = ESClientFactory.get()

        # Group the entity IDs to look up by the contribution index that
        # holds them (one index per catalog and entity type)
        entity_ids_by_index: MutableMapping[str,
                                            MutableSet[str]] = defaultdict(set)
        for entity in tallies.keys():
            index = config.es_index_name(catalog=entity.catalog,
                                         entity_type=entity.entity_type,
                                         aggregate=False)
            entity_ids_by_index[index].add(entity.entity_id)

        # Match each index only against the entity IDs expected in it
        query = {
            'bool': {
                'should': [{
                    'bool': {
                        'must': [{
                            'term': {
                                '_index': index
                            }
                        }, {
                            'terms': {
                                'entity_id.keyword': list(entity_ids)
                            }
                        }]
                    }
                } for index, entity_ids in entity_ids_by_index.items()]
            }
        }

        index = sorted(list(entity_ids_by_index.keys()))
        num_contributions = sum(tallies.values())
        log.info('Reading %i expected contribution(s)', num_contributions)

        def pages() -> Iterable[JSONs]:
            # Page through the hits with search_after, sorting on a key that
            # is unique per document so that pages are stable
            body = dict(query=query)
            while True:
                response = es_client.search(
                    index=index,
                    sort=['_index', 'document_id.keyword'],
                    body=body,
                    size=config.contribution_page_size,
                    track_total_hits=False,
                    seq_no_primary_term=Contribution.needs_seq_no_primary_term)
                hits = response['hits']['hits']
                log.debug('Read a page with %i contribution(s)', len(hits))
                if hits:
                    yield hits
                    # Resume the next page right after this page's last hit
                    body['search_after'] = hits[-1]['sort']
                else:
                    break

        contributions = [
            Contribution.from_index(self.catalogued_field_types(), hit)
            for hits in pages() for hit in hits
        ]

        log.info('Read %i contribution(s)', len(contributions))
        if log.isEnabledFor(logging.DEBUG):
            entity_ref = attrgetter('entity')
            log.debug(
                'Number of contributions read, by entity: %r', {
                    f'{entity.entity_type}/{entity.entity_id}': sum(
                        1 for _ in contribution_group)
                    for entity, contribution_group in groupby(
                        sorted(contributions, key=entity_ref), key=entity_ref)
                })
        return contributions