def add_cart_item(self,
                  catalog: CatalogName,
                  user_id,
                  cart_id,
                  entity_id,
                  entity_type,
                  entity_version):
    """
    Add an item to a cart and return the created item ID.

    An error will be raised if the cart does not exist or does not belong
    to the user.
    """
    # TODO: Cart item should have some user readable name
    if cart_id is None:
        cart = self.get_or_create_default_cart(user_id)
    else:
        cart = self.get_cart(user_id, cart_id)
    real_cart_id = cart['CartId']
    if not entity_version:
        # When entity_version is not given, this method will check the data
        # integrity and retrieve the version.
        entity = ESClientFactory.get().get(index=config.es_index_name(catalog=catalog,
                                                                      entity_type=entity_type,
                                                                      aggregate=True),
                                           id=entity_id,
                                           _source=True,
                                           _source_include=[
                                               'contents.files.uuid',  # data file UUID
                                               'contents.files.version',  # data file version
                                               'contents.projects.document_id',  # metadata file UUID
                                               'contents.samples.document_id',  # metadata file UUID
                                           ])['_source']
        normalized_entity = self.extract_entity_info(entity_type, entity)
        entity_version = normalized_entity['version']
    new_item = self.transform_entity_to_cart_item(real_cart_id, entity_type, entity_id, entity_version)
    self.dynamo_accessor.insert_item(config.dynamo_cart_item_table_name, new_item)
    return new_item['CartItemId']
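# A hedged sketch (not from the source) of the fallback lookup above: when no
# entity_version is supplied, the aggregate document is fetched with a source
# filter so only the UUID/version fields travel over the wire. The client
# construction, index name, and document ID below are hypothetical, and the
# `_source_include` spelling assumes the same elasticsearch-py generation as
# the code above (newer clients rename it to `_source_includes`).
from elasticsearch import Elasticsearch

es = Elasticsearch()
source = es.get(index='some_files_aggregate_index',  # hypothetical index name
                id='0c5ac7c0-817e-40d4-b1b1-34c3d5cfecdb',  # hypothetical UUID
                _source_include=['contents.files.uuid',
                                 'contents.files.version'])['_source']
# extract_entity_info() would then derive the entity version from these fields.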
def index_names(self, catalog: CatalogName) -> List[str]:
    return [
        config.es_index_name(catalog=catalog,
                             entity_type=entity_type,
                             aggregate=aggregate)
        for entity_type in self.entity_types(catalog)
        for aggregate in (False, True)
    ]
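# A self-contained sketch of how the comprehension above fans out: each entity
# type yields two index names, one per aggregate flag. The name format below
# is a stand-in for illustration only; the authoritative format comes from
# config.es_index_name(), and the catalog and entity types are hypothetical.
def fake_index_name(catalog, entity_type, aggregate):
    return f'{catalog}_{entity_type}' + ('_aggregate' if aggregate else '')

entity_types = ['files', 'samples', 'projects']  # hypothetical
names = [fake_index_name('dcp2', entity_type, aggregate)
         for entity_type in entity_types
         for aggregate in (False, True)]
# ['dcp2_files', 'dcp2_files_aggregate', 'dcp2_samples', 'dcp2_samples_aggregate', ...]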
def _create_request(self,
                    catalog: CatalogName,
                    filters: FiltersJSON,
                    post_filter: bool = False,
                    source_filter: SourceFilters = None,
                    enable_aggregation: bool = True,
                    entity_type='files') -> Search:
    """
    Create an Elasticsearch request based on the filters and facet
    configuration passed into the function.

    :param filters: The 'filters' parameter. Assumed to have been translated
                    into es_key terms

    :param post_filter: Flag for doing either post_filter or regular
                        querying (i.e. faceting or not)

    :param source_filter: A list of "foo.bar" field paths (see
                          https://www.elastic.co/guide/en/elasticsearch/reference/5.5/search-request-source-filtering.html)

    :param enable_aggregation: Flag for enabling query aggregation (and
                               effectively ignoring facet configuration)

    :param entity_type: the string referring to the entity type used to get
                        the Elasticsearch index to search

    :return: The Search object that can be used for executing the request
    """
    service_config = self.service_config(catalog)
    field_mapping = service_config.translation
    facet_config = {
        key: field_mapping[key]
        for key in service_config.facets
    }
    es_search = Search(using=self.es_client,
                       index=config.es_index_name(catalog=catalog,
                                                  entity_type=entity_type,
                                                  aggregate=True))
    filters = self._translate_filters(catalog, filters, field_mapping)
    es_query = self._create_query(catalog, filters)
    if post_filter:
        es_search = es_search.post_filter(es_query)
    else:
        es_search = es_search.query(es_query)
    if source_filter:
        es_search = es_search.source(includes=source_filter)
    elif entity_type not in ("files", "bundles"):
        es_search = es_search.source(excludes="bundles")
    if enable_aggregation:
        for agg, translation in facet_config.items():
            # FIXME: Aggregation filters may be redundant when post_filter is false
            #        https://github.com/DataBiosphere/azul/issues/3435
            es_search.aggs.bucket(agg,
                                  self._create_aggregate(catalog, filters, facet_config, agg))
    return es_search
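# A runnable sketch (index and field names are hypothetical) of the
# post_filter/query split above, using elasticsearch_dsl. With post_filter,
# the filter is applied after aggregation, so facet counts ignore the user's
# current selection; with query, the filter narrows the aggregations too.
from elasticsearch_dsl import Q, Search

q = Q('terms', **{'organ.keyword': ['brain']})

faceted = Search(index='some_index').post_filter(q)
faceted.aggs.bucket('organ', 'terms', field='organ.keyword')

filtered = Search(index='some_index').query(q)

print(faceted.to_dict())   # filter appears under 'post_filter'
print(filtered.to_dict())  # filter appears under 'query'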
def index_name(self) -> str:
    """
    The fully qualified name of the Elasticsearch index for a document with
    these coordinates. Only call this if these coordinates use a catalogued
    entity reference. You can use `.with_catalog()` to create one.
    """
    assert isinstance(self.entity, CataloguedEntityReference)
    return config.es_index_name(catalog=self.entity.catalog,
                                entity_type=self.entity.entity_type,
                                aggregate=self.aggregate)
def _load_canned_result(self, bundle_fqid: BundleFQID) -> MutableJSONs:
    """
    Load the canned index documents for the given canned bundle and fix the
    '_index' entry in each to match the index name in the current deployment.
    """
    expected_hits = self._load_canned_file(bundle_fqid, 'results')
    assert isinstance(expected_hits, list)
    for hit in expected_hits:
        index_name = IndexName.parse(hit['_index'])
        hit['_index'] = config.es_index_name(catalog=self.catalog,
                                             entity_type=index_name.entity_type,
                                             aggregate=index_name.aggregate)
    return expected_hits
def _create_autocomplete_request(self,
                                 catalog: CatalogName,
                                 filters: FiltersJSON,
                                 es_client,
                                 _query,
                                 search_field,
                                 entity_type='files'):
    """
    Create an Elasticsearch request based on the filters passed to the
    function.

    :param catalog: The name of the catalog to create the ES request for

    :param filters: The 'filters' parameter from '/keywords'

    :param es_client: The Elasticsearch client object used to configure the
                      Search object

    :param _query: The query string to use for querying

    :param search_field: The field to do the query on

    :param entity_type: the string referring to the entity type used to get
                        the Elasticsearch index to search

    :return: The Search object that can be used for executing the request
    """
    service_config = self.service_config(catalog)
    field_mapping = service_config.autocomplete_translation[entity_type]
    es_search = Search(using=es_client,
                       index=config.es_index_name(catalog=catalog,
                                                  entity_type=entity_type,
                                                  aggregate=True))
    filters = self._translate_filters(catalog, filters, field_mapping)
    if search_field in field_mapping:
        search_field = field_mapping[search_field]
    es_filter_query = self._create_query(catalog, filters)
    es_search = es_search.post_filter(es_filter_query)
    es_search = es_search.query(Q('prefix', **{str(search_field): _query}))
    return es_search
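# A sketch (hypothetical index, field, and prefix) of the request shape built
# above: a prefix query on the search field, combined with a post_filter so
# the user's active filters constrain the hits without skewing how the prefix
# match itself is scored.
from elasticsearch_dsl import Q, Search

s = Search(index='files_aggregate')  # hypothetical index name
s = s.post_filter(Q('terms', **{'organ.keyword': ['brain']}))
s = s.query(Q('prefix', **{'fileName.keyword': 'SRR81'}))
print(s.to_dict())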
def index_name(entity_type):
    return config.es_index_name(catalog=self.catalog,
                                entity_type=entity_type,
                                aggregate=aggregate)
def transform_request(self,
                      catalog: CatalogName,
                      entity_type: str,
                      filters: Filters,
                      pagination: Optional[Pagination] = None) -> MutableJSON:
    """
    Do the whole transformation process, taking the filters and pagination,
    if any. An empty `filters` results in a match_all request. Omitting
    `pagination` excludes pagination from the output.

    :param catalog: The name of the catalog to query

    :param entity_type: the string referring to the entity type used to get
                        the Elasticsearch index to search

    :param filters: Filter parameter from the API to be used in the query

    :param pagination: Pagination to be used for the API

    :return: The transformed request
    """
    service_config = self.service_config(catalog)
    translation = service_config.translation
    inverse_translation = {v: k for k, v in translation.items()}
    for facet in filters.keys():
        if facet not in translation:
            raise BadArgumentException(f"Unable to filter by undefined facet {facet}.")
    if pagination is not None:
        facet = pagination["sort"]
        if facet not in translation:
            raise BadArgumentException(f"Unable to sort by undefined facet {facet}.")
    es_search = self._create_request(catalog=catalog,
                                     filters=filters,
                                     post_filter=True,
                                     entity_type=entity_type)
    if pagination is None:
        # It's a single file search
        self._annotate_aggs_for_translation(es_search)
        es_response = es_search.execute(ignore_cache=True)
        self._translate_response_aggs(catalog, es_response)
        es_response_dict = es_response.to_dict()
        hits = [hit['_source'] for hit in es_response_dict['hits']['hits']]
        hits = self.translate_fields(catalog, hits, forward=False)
        final_response = KeywordSearchResponse(hits, entity_type, catalog)
    else:
        # It's a full file search
        # Translate the sort field if there is any translation available
        if pagination['sort'] in translation:
            pagination['sort'] = translation[pagination['sort']]
        es_search = self._apply_paging(catalog, es_search, pagination)
        self._annotate_aggs_for_translation(es_search)
        try:
            es_response = es_search.execute(ignore_cache=True)
        except elasticsearch.NotFoundError as e:
            raise IndexNotFoundError(e.info["error"]["index"])
        except elasticsearch.RequestError as e:
            if one(e.info['error']['root_cause'])['reason'].startswith('No mapping found for'):
                es_response = self.es_client.count(index=config.es_index_name(catalog=catalog,
                                                                              entity_type=entity_type,
                                                                              aggregate=True))
                if es_response['count'] == 0:
                    # Count is zero for empty index
                    final_response = FileSearchResponse(hits={},
                                                        pagination={},
                                                        facets={},
                                                        entity_type=entity_type,
                                                        catalog=catalog)
                    return final_response.apiResponse.to_json()
            raise e
        self._translate_response_aggs(catalog, es_response)
        es_response_dict = es_response.to_dict()
        # Extract hits and facets (aggregations)
        es_hits = es_response_dict['hits']['hits']
        # If the number of elements exceeds the page size, then we fetched
        # one too many entries to determine if there is a previous or next
        # page. In that case, return one fewer hit.
        list_adjustment = 1 if len(es_hits) > pagination['size'] else 0
        if 'search_before' in pagination:
            hits = reversed(es_hits[0:len(es_hits) - list_adjustment])
        else:
            hits = es_hits[0:len(es_hits) - list_adjustment]
        hits = [hit['_source'] for hit in hits]
        hits = self.translate_fields(catalog, hits, forward=False)
        facets = es_response_dict['aggregations'] if 'aggregations' in es_response_dict else {}
        pagination['sort'] = inverse_translation[pagination['sort']]
        paging = self._generate_paging_dict(catalog, filters, es_response_dict, pagination)
        final_response = FileSearchResponse(hits, paging, facets, entity_type, catalog)
    final_response = final_response.apiResponse.to_json()
    return final_response
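# A self-contained sketch (not the source's helpers) of the page-size trick
# used above: request one row more than the page size; a full size+1 result
# proves another page exists, and the extra row is trimmed before returning.
# When paging backwards ('search_before'), the rows came back in reverse sort
# order, so the trimmed page is reversed again for display.
page_size = 3
es_hits = ['h1', 'h2', 'h3', 'h4']  # size + 1 rows came back

list_adjustment = 1 if len(es_hits) > page_size else 0
page = es_hits[0:len(es_hits) - list_adjustment]
assert page == ['h1', 'h2', 'h3']
has_more = list_adjustment == 1  # the trimmed row proves a next page exists

backwards_page = list(reversed(page))  # the 'search_before' case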
def _read_contributions(self, tallies: CataloguedTallies) -> List[CataloguedContribution]:
    es_client = ESClientFactory.get()

    entity_ids_by_index: MutableMapping[str, MutableSet[str]] = defaultdict(set)
    for entity in tallies.keys():
        index = config.es_index_name(catalog=entity.catalog,
                                     entity_type=entity.entity_type,
                                     aggregate=False)
        entity_ids_by_index[index].add(entity.entity_id)

    query = {
        "query": {
            "bool": {
                "should": [
                    {
                        "bool": {
                            "must": [
                                {
                                    "term": {
                                        "_index": index
                                    }
                                },
                                {
                                    "terms": {
                                        "entity_id.keyword": list(entity_ids)
                                    }
                                }
                            ]
                        }
                    }
                    for index, entity_ids in entity_ids_by_index.items()
                ]
            }
        }
    }

    index = sorted(list(entity_ids_by_index.keys()))

    # scan() uses a server-side cursor and is expensive. Only use it if the
    # number of contributions is large.
    page_size = 1000  # page size of 100 caused excessive ScanError occurrences
    num_contributions = sum(tallies.values())
    hits = None
    if num_contributions <= page_size:
        log.info('Reading %i expected contribution(s) using search().', num_contributions)
        response = es_client.search(index=index,
                                    body=query,
                                    size=page_size,
                                    doc_type=Document.type)
        total_hits = response['hits']['total']
        if total_hits <= page_size:
            hits = response['hits']['hits']
            if len(hits) != total_hits:
                message = f'Search returned {len(hits)} hits but reports total to be {total_hits}'
                raise EventualConsistencyException(message)
        else:
            log.info('Expected only %i contribution(s) but got %i.', num_contributions, total_hits)
            num_contributions = total_hits
    if hits is None:
        log.info('Reading %i expected contribution(s) using scan().', num_contributions)
        hits = scan(es_client,
                    index=index,
                    query=query,
                    size=page_size,
                    doc_type=Document.type)

    contributions = [
        Contribution.from_index(self.catalogued_field_types(), hit)
        for hit in hits
    ]

    log.info('Read %i contribution(s).', len(contributions))
    if log.isEnabledFor(logging.DEBUG):
        entity_ref = attrgetter('entity')
        log.debug('Number of contributions read, by entity: %r',
                  {
                      f'{entity.entity_type}/{entity.entity_id}': sum(1 for _ in contribution_group)
                      for entity, contribution_group in groupby(sorted(contributions, key=entity_ref),
                                                                key=entity_ref)
                  })
    return contributions
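# A hedged sketch of the scan() fallback above. scan() from
# elasticsearch.helpers drives a server-side scroll cursor, which is why the
# code above prefers a single search() for small result sets. The index name
# and query are hypothetical, and the keyword arguments assume a client
# version where scan() forwards them to search(), as the code above does.
from elasticsearch import Elasticsearch
from elasticsearch.helpers import scan

es = Elasticsearch()
query = {'query': {'terms': {'entity_id.keyword': ['id-1', 'id-2']}}}
for hit in scan(es, index=['some_index'], query=query, size=1000):
    print(hit['_source'])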
def _index_name(self):
    return config.es_index_name(catalog=self.catalog,
                                entity_type='files',
                                aggregate=True)
def _read_contributions(self, tallies: CataloguedTallies) -> List[CataloguedContribution]:
    es_client = ESClientFactory.get()

    entity_ids_by_index: MutableMapping[str, MutableSet[str]] = defaultdict(set)
    for entity in tallies.keys():
        index = config.es_index_name(catalog=entity.catalog,
                                     entity_type=entity.entity_type,
                                     aggregate=False)
        entity_ids_by_index[index].add(entity.entity_id)

    query = {
        'bool': {
            'should': [
                {
                    'bool': {
                        'must': [
                            {
                                'term': {
                                    '_index': index
                                }
                            },
                            {
                                'terms': {
                                    'entity_id.keyword': list(entity_ids)
                                }
                            }
                        ]
                    }
                }
                for index, entity_ids in entity_ids_by_index.items()
            ]
        }
    }

    index = sorted(list(entity_ids_by_index.keys()))
    num_contributions = sum(tallies.values())
    log.info('Reading %i expected contribution(s)', num_contributions)

    def pages() -> Iterable[JSONs]:
        body = dict(query=query)
        while True:
            response = es_client.search(index=index,
                                        sort=['_index', 'document_id.keyword'],
                                        body=body,
                                        size=config.contribution_page_size,
                                        track_total_hits=False,
                                        seq_no_primary_term=Contribution.needs_seq_no_primary_term)
            hits = response['hits']['hits']
            log.debug('Read a page with %i contribution(s)', len(hits))
            if hits:
                yield hits
                body['search_after'] = hits[-1]['sort']
            else:
                break

    contributions = [
        Contribution.from_index(self.catalogued_field_types(), hit)
        for hits in pages()
        for hit in hits
    ]

    log.info('Read %i contribution(s)', len(contributions))
    if log.isEnabledFor(logging.DEBUG):
        entity_ref = attrgetter('entity')
        log.debug('Number of contributions read, by entity: %r',
                  {
                      f'{entity.entity_type}/{entity.entity_id}': sum(1 for _ in contribution_group)
                      for entity, contribution_group in groupby(sorted(contributions, key=entity_ref),
                                                                key=entity_ref)
                  })
    return contributions
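# A self-contained sketch of the search_after loop in pages() above, assuming
# a plain elasticsearch-py client; the index name and page size are
# hypothetical. Each page's last 'sort' value seeds the next request, which
# avoids scroll cursors entirely and is why pages() needs a deterministic,
# tie-breaking sort key.
from elasticsearch import Elasticsearch

es = Elasticsearch()
body = {'query': {'match_all': {}}}
while True:
    response = es.search(index='some_index',
                         sort=['_index', 'document_id.keyword'],
                         body=body,
                         size=100,
                         track_total_hits=False)
    hits = response['hits']['hits']
    if hits:
        for hit in hits:
            print(hit['_id'])
        # Resume after the last hit's sort values on the next iteration
        body['search_after'] = hits[-1]['sort']
    else:
        break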