def simple_search_public_data(query_text):
    """Search experiments, datasets and datafiles, restricted to public data.

    Runs a single MultiSearch over the three indices; every sub-query is
    limited to documents whose experiment grants public_access == 100
    (for datasets the ACL lives on the nested 'experiments' documents).

    Returns a dict with keys 'experiments', 'datasets' and 'datafiles',
    each a list of raw hit dicts.
    """
    indices = ['experiments', 'dataset', 'datafile']
    multi = MultiSearch(index=indices)

    # Experiments: match on title, public only.
    exp_query = Q("match", title=query_text) & Q("term", public_access=100)
    multi = multi.add(
        Search(index='experiments')
        .extra(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
        .query(exp_query))

    # Datasets: match on description; public-access check is applied via a
    # nested query against the embedded experiments.
    nested_public = Q("term", **{'experiments.public_access': 100})
    multi = multi.add(
        Search(index='dataset')
        .extra(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
        .query(Q("match", description=query_text))
        .query('nested', path='experiments', query=nested_public))

    # Datafiles: match on filename, restricted to public experiments.
    datafile_query = (Q("match", filename=query_text)
                      & Q("term", experiments__public_access=100))
    multi = multi.add(
        Search(index='datafile')
        .extra(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
        .query(datafile_query))

    # Bucket the raw hits by the index each one came from.
    buckets = {"experiments": [], "datasets": [], "datafiles": []}
    index_to_bucket = {
        "dataset": "datasets",
        "experiments": "experiments",
        "datafile": "datafiles",
    }
    for response in multi.execute():
        for hit in response.hits.hits:
            bucket = index_to_bucket.get(hit["_index"])
            if bucket is not None:
                buckets[bucket].append(hit.to_dict())
    return buckets
def search_request(request):
    """Render search results for ``?q=`` across all music indices.

    Pairs a heavily boosted exact-phrase multi_match with a fuzzy
    multi_match inside dis_max, then maps each hit to a row dict
    (id, score, url name, model, display text) for the template.

    Implicitly returns None when the method is not GET or no query was
    supplied, preserving the original control flow.
    """
    if request.method == 'GET':
        if q := request.GET.get('q', None):
            object_list = []
            search = Search(
                index=['songs', 'artists', 'albums', 'labels', 'genres'])
            # BUG FIX: Search.from_dict is a classmethod that constructs a
            # brand-new Search object, silently discarding the index list
            # configured above. update_from_dict merges the raw query into
            # this search (and returns it), keeping the indices.
            objects = search.update_from_dict({
                "query": {
                    "dis_max": {
                        "queries": [{
                            "multi_match": {
                                "query": q,
                                "type": "phrase",
                                "fields": [
                                    "title^20", "lyrics^10", "name^100",
                                    "description^50"
                                ]
                            }
                        }, {
                            "multi_match": {
                                "query": q,
                                "fuzziness": "AUTO",
                                "fields": [
                                    "title^2", "lyrics", "name^10",
                                    "description^5"
                                ]
                            }
                        }]
                    }
                }
            })
            # (leftover debug print(obj) removed)
            for obj in objects:
                row = {
                    'id': obj.meta.id,
                    'score': obj.meta.score,
                    # e.g. index 'songs' -> URL name 'main:song-detail'
                    'url': 'main:' + obj.meta.index[:-1] + '-detail',
                    'model': obj.meta.index,
                }
                if obj.meta.index == 'songs':
                    row['text'] = f'Song: {obj.title}'
                elif obj.meta.index == 'artists':
                    row['text'] = f'Artist: {obj.name}'
                elif obj.meta.index == 'labels':
                    row['text'] = f'Label: {obj.name}'
                elif obj.meta.index == 'genres':
                    row['text'] = f'Genre: {obj.name}'
                elif obj.meta.index == 'albums':
                    row['text'] = f'Album: {obj.title}'
                object_list.append(row)
            return render(request=request,
                          template_name="main/search.html",
                          context={'object_list': object_list})
def get_object_list(self, request):
    """Full-text search over experiments/datasets/datafiles for ``?query=``.

    Anonymous users are routed to the public-data-only search. For
    authenticated users every match query is combined with an ACL clause
    accepting documents owned by the user, by any of the user's groups,
    or public ones (public_access == 100).

    Returns a one-element list wrapping the hit buckets in a SearchObject.
    """
    user = request.user
    query_text = request.GET.get('query', None)
    if not user.is_authenticated:
        result_dict = simple_search_public_data(query_text)
        return [SearchObject(id=1, hits=result_dict)]
    groups = user.groups.all()
    index_list = ['experiments', 'dataset', 'datafile']
    ms = MultiSearch(index=index_list)
    # Experiments: title match AND (owned-by-user OR public OR group-owned).
    query_exp = Q("match", title=query_text)
    query_exp_oacl = Q("term", objectacls__entityId=user.id) | \
        Q("term", public_access=100)
    for group in groups:
        query_exp_oacl = query_exp_oacl | \
            Q("term", objectacls__entityId=group.id)
    query_exp = query_exp & query_exp_oacl
    ms = ms.add(
        Search(index='experiments').extra(
            size=MAX_SEARCH_RESULTS,
            min_score=MIN_CUTOFF_SCORE).query(query_exp))
    # Datasets: description match; the ACL clause targets fields of the
    # nested 'experiments' documents, so it is attached via a nested query.
    query_dataset = Q("match", description=query_text)
    query_dataset_oacl = Q("term", **{'experiments.objectacls.entityId': user.id}) | \
        Q("term", **{'experiments.public_access': 100})
    for group in groups:
        query_dataset_oacl = query_dataset_oacl | \
            Q("term", **{'experiments.objectacls.entityId': group.id})
    ms = ms.add(
        Search(index='dataset').extra(
            size=MAX_SEARCH_RESULTS,
            min_score=MIN_CUTOFF_SCORE).query(query_dataset).query(
                'nested', path='experiments', query=query_dataset_oacl))
    # Datafiles: filename match AND-ed directly with the ACL clause.
    query_datafile = Q("match", filename=query_text)
    query_datafile_oacl = Q("term", experiments__objectacls__entityId=user.id) | \
        Q("term", experiments__public_access=100)
    for group in groups:
        query_datafile_oacl = query_datafile_oacl | \
            Q("term", experiments__objectacls__entityId=group.id)
    query_datafile = query_datafile & query_datafile_oacl
    ms = ms.add(
        Search(index='datafile').extra(
            size=MAX_SEARCH_RESULTS,
            min_score=MIN_CUTOFF_SCORE).query(query_datafile))
    results = ms.execute()
    # Bucket the raw hits by the index each one came from.
    result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
    for item in results:
        for hit in item.hits.hits:
            if hit["_index"] == "dataset":
                result_dict["datasets"].append(hit.to_dict())
            elif hit["_index"] == "experiments":
                result_dict["experiments"].append(hit.to_dict())
            elif hit["_index"] == "datafile":
                result_dict["datafiles"].append(hit.to_dict())
    return [SearchObject(id=1, hits=result_dict)]
def filter(self, qs, value):
    """Filter the category queryset via an Elasticsearch relevance search.

    Unpublished categories are excluded, hits are ranked by score then
    view count, and the Django queryset is re-ordered to match the order
    Elasticsearch returned. An empty response yields an empty queryset.
    """
    es = Elasticsearch([settings.ELASTICSEARCH_HOST])
    term = value.lower()
    raw_query = {
        "bool": {
            # Drop anything explicitly flagged is_published=False.
            "must_not": [{"term": {"is_published": False}}],
            "should": [{
                "simple_query_string": {
                    "fields": ["category_name"],
                    "quote_field_suffix": ".exact",
                    "query": term,
                }
            }],
        }
    }
    search = (Search(using=es, index='category')
              .query(raw_query)
              .sort("_score", "-views")
              .extra(size=self.max_result, from_=0))
    response = search.execute()
    if not response:
        # TODO: a queryset-based fallback used to live here (see VCS history).
        return qs.none()[:self.max_result]
    ids = [hit.meta.id for hit in response]
    # Preserve Elasticsearch's relevance ordering in the SQL result.
    ordering = Case(*[When(pk=pk, then=pos) for pos, pk in enumerate(ids)])
    return qs.filter(id__in=ids).order_by(ordering)[:self.max_result]
def _do_check(self):
    """Health check: one simple database select plus one Elasticsearch search.

    Performs a select on the HealthCheck table, then runs a representative
    search against Elasticsearch; any exception fails the check.

    :return: True if both checks complete without raising, False otherwise
        (the exception is reported to Sentry via capture_exception).
    """
    try:
        # Database check: a select on a trivial table.
        HealthCheck.objects.get(health_check_field=True)
        # Elasticsearch check.
        client = Elasticsearch(hosts=[settings.ES_URL])
        # The probe text is a fixed literal containing no comma, so the
        # old conditional operator always evaluated to "and" — simplified.
        query_object = {
            "multi_match": {
                "query": "a_commodity_or_code",
                "type": "most_fields",
                "fields": ["keywords", "description"],
                "operator": "and",
            }
        }
        # BUG FIX: elasticsearch-dsl Search objects are lazy — without
        # .execute() no request was ever sent, so this "check" could
        # never detect an Elasticsearch outage.
        Search().index("indexes").using(client).query(query_object).sort(
            "sort_object").execute()
        # Return success if we have reached this point
        return True
    except Exception as e:
        capture_exception(e)
        return False
def highlight(self, search: Search) -> Search:
    """Enable highlighting on all fields, wrapping matches in <mark> tags."""
    # TODO: Why did we have this?
    # search = search.highlight_options(require_field_match=False)
    return search.highlight(
        "*",
        fragment_size=150,
        pre_tags="<mark>",
        post_tags="</mark>",
    )
def autocomplete_search(q, doc_type=None, fuzzy_mode=False, **kwargs):
    """Run an autocomplete match query across all indices.

    :param q: raw user input, expanded via autocomplete_query.
    :param doc_type: optional document type to narrow the search.
    :param fuzzy_mode: passed through to autocomplete_query.
    :keyword limit: page size, capped at 100 (default 20).
    :keyword offset: pagination offset (default 0).
    :keyword filters: dict mapping field name -> list of values; the first
        value of each entry is applied as a term query.
    :return: the formatted search results.
    """
    query = autocomplete_query(q, fuzzy_mode)
    limit = kwargs.get('limit', 20)
    offset = kwargs.get('offset', 0)
    filters = kwargs.get('filters', {})
    # Hard cap on the page size to protect the cluster.
    if limit and limit > 100:
        limit = 100
    s = Search().index('_all')
    if doc_type:
        s = s.doc_type(doc_type)
    s = s.query('match', autocomplete=query)
    # TODO: implement in a generic way
    # add filters like: `&filter_status=Ready&filter_type=Broadcasts`
    # BUG FIX: dict.iteritems() is Python 2 only and raises AttributeError
    # on Python 3 (this codebase uses f-strings/:=, i.e. Python 3).
    for key, value in filters.items():
        s = s.query('term', **{key: value[0]})
    s = s[offset:limit + offset]
    return format_search_results(s.execute())
def autocomplete(query: str) -> Response:
    """
    https://www.elastic.co/guide/en/elasticsearch/guide/current/_index_time_search_as_you_type.html

    We use the ngram-based autocomplete-analyzer for indexing, but the
    standard analyzer for searching. This way the whole entered word has to
    match (save for some fuzziness) and the algorithm does not fall back to
    matching only the first character in extreme cases — preventing absurd
    suggestions like "Garret Walker" and "Hector Mendoza" for the input
    "Mahatma Ghandi".
    """
    match_params = {
        "query": escape_elasticsearch_query(query),
        "analyzer": "standard",
        "fuzziness": "AUTO",
        "prefix_length": 1,
    }
    # People and organizations are the most useful suggestions, papers next.
    boosts = [
        {DOCUMENT_INDICES["person"]: 4},
        {DOCUMENT_INDICES["organization"]: 4},
        {DOCUMENT_INDICES["paper"]: 2},
    ]
    search = (Search(index=list(DOCUMENT_INDICES.values()))
              .query("match", autocomplete=match_params)
              .extra(min_score=1)
              .update_from_dict({"indices_boost": boosts}))
    return search.execute()
def search_by_code(code):
    """Look up documents by exact (normalised) commodity code.

    The stored hierarchy_context field is a JSON string and is decoded
    in place on each hit; hits missing the field are logged and left
    untouched. Returns the (lazily iterated) search.
    """
    es = Elasticsearch(hosts=[settings.ES_URL])
    term_query = {"term": {"commodity_code": process_commodity_code(code)}}
    hits = Search().index(*alias_names).using(es).query(term_query)
    for hit in hits:
        try:
            hit["hierarchy_context"] = json.loads(hit["hierarchy_context"])
        except KeyError as exception:
            # No hierarchy_context on this document — record and move on.
            logger.info("{0} {1}".format(hit["commodity_code"], exception.args))
    return hits
def _add_date_before(search: Search, params: Dict[str, Any], options, errors) -> Search:
    """Filters by a date given a string, catching parsing errors.

    Keeps documents whose ``start`` or ``legal_date`` is on or before the
    parsed ``params["before"]``. On a parse failure an error message is
    appended to ``errors`` and the search is returned unchanged; on
    success the parsed date is recorded in ``options["before"]``.
    """
    try:
        before = parse(params["before"])
    except (ValueError, OverflowError) as e:
        # BUG FIX: this was an f-string inside gettext(), so the msgid was
        # formatted *before* the catalog lookup and could never match a
        # translation entry. Translate the template, then interpolate.
        errors.append(
            gettext(
                "The value for before is invalid. The correct format is "
                "'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS': %(err)s"
            ) % {"err": e})
        return search
    # Either date field qualifying is enough to keep the document.
    search = search.filter(
        Q("range", start={"lte": before}) | Q("range", legal_date={"lte": before}))
    options["before"] = before
    return search
def _build_search_request(query, sort_key, sort_order, filter_on_leaf=None):
    """Build (but do not execute) the keyword/description search request.

    A comma in the query switches the multi_match operator from "and" to
    "or", so comma-separated input matches any of the listed terms.
    When filter_on_leaf is truthy, results are narrowed with a term
    filter on the ``leaf`` field.
    """
    es_client = Elasticsearch(hosts=[settings.ES_URL])
    operator = "or" if "," in query else "and"
    match_clause = {
        "multi_match": {
            "query": query,
            "type": "most_fields",
            "fields": ["keywords", "description"],
            "operator": operator,
        }
    }
    request = (Search()
               .index(*alias_names)
               .using(es_client)
               .query(match_clause)
               .sort({sort_key: sort_order}))
    if filter_on_leaf:
        request = request.filter("term", leaf=filter_on_leaf)
    return request
def query(self, search: Search, query: str) -> Search:
    """Attach the user's search term to the query; pass through when empty."""
    if not query:
        return search
    self.options["searchterm"] = query
    escaped = escape_elasticsearch_query(query)
    # Per https://stackoverflow.com/a/35375562/3549270: pair an exact
    # multi-match with a fuzzy one so exact matches outscore fuzzy hits.
    # Fuzziness AUTO(=2) would tolerate more errors but is a lot slower
    # and produces many false positives, hence fuzziness "1".
    exact = MultiMatch(query=escaped, operator="and", fields=self.fields)
    fuzzy = MultiMatch(
        query=escaped,
        operator="and",
        fields=self.fields,
        fuzziness="1",
        prefix_length=1,
    )
    return search.query(Bool(should=[exact, fuzzy]))
def filter(self, qs, value):
    """Filter the offer queryset via an Elasticsearch relevance search.

    Combines a simple_query_string over the name/category fields with a
    constant-score transliterated (en->ru) partial match, excludes
    unpublished offers, and re-orders the Django queryset to match the
    order Elasticsearch returned. An empty response yields an empty
    queryset.
    """
    # Initialise the Elasticsearch connection.
    client = Elasticsearch([settings.ELASTICSEARCH_HOST])
    value = value.lower()
    # Build the query.
    search_query = {
        "bool": {
            "must_not": [
                # Excludes is_published=False from the results.
                {
                    "term": {
                        "is_published": False
                    }
                }
            ],
            "should": [
                {
                    "simple_query_string": {
                        # Look for something sensible.
                        "fields": ["fullname", "category_name"],
                        "quote_field_suffix": ".exact",
                        "query": value
                    }
                },
                {
                    # Partial match on transliterated strings (en->ru).
                    # constant_score disables boosting by term frequency.
                    "constant_score": {
                        "filter": {
                            "match": {
                                "fullname_translit": {
                                    "query": value,
                                    "fuzziness": 1,
                                    "operator": "and",
                                }
                            }
                        }
                    }
                },
            ]
        }
    }
    # Initialise the request.
    s = Search(using=client, index='offer') \
        .query(search_query)\
        .sort("_score", "-views")\
        .extra(size=self.max_result, from_=0)
    self.hits_list = []
    items = s.execute()
    if items:
        for item in items:
            self.hits_list.append(item.meta.id)
        # Needed so the selection from Postgres keeps the order that
        # Elasticsearch returned.
        self.hits_order = Case(*[
            When(pk=pk, then=pos) for pos, pk in enumerate(self.hits_list)
        ])
        qs = qs.filter(id__in=self.hits_list).order_by(self.hits_order)
    else:
        qs = qs.none()
    # TODO: old implementation. Maybe keep it as a fallback?
    # else:
    #     qs = qs.annotate(full_name=Concat(
    #         'product__article', Value(' '),
    #         'product__title', Value(' '),
    #         'product__search_title'))
    #     bits = value.split(' ')
    #     if len(bits) is 1 and bits[0].isdecimal():
    #         full_name_clauses = Q(full_name__icontains=bits[0])
    #     else:
    #         full_name_clauses = reduce(
    #             operator.and_,
    #             [Q(full_name__iregex=r'(^|\s)%s' % escape(v))
    #              for v in bits])
    #
    #     unpublished = Category.objects.get_queryset_descendants(
    #         Category.objects.filter(is_published=False),
    #         include_self=True)
    #
    #     qs = (qs.filter(full_name_clauses)
    #           .exclude(product__category__in=unpublished))
    #
    #     if self.uniq_category:
    #         products = (qs.order_by('product__category__title')
    #                     .distinct('product__category__title'))
    #         qs = (qs.filter(id__in=products)
    #               .order_by('-product__category__views'))
    return qs
class DrugsListView(APIView):
    """API view listing (GET, paginated wildcard search) and creating (POST)
    drugs backed by the 'drugs' Elasticsearch index."""

    # Class-level (shared) connection and base search, sorted by the raw
    # trade name; each .query()/slice call below derives a new search.
    client = Elasticsearch(hosts=[{"host": "elasticsearch", "port": 9200}])
    search = Search(index='drugs').using(client).sort('trade_name.raw')

    @swagger_auto_schema(manual_parameters=[PAGE_FIELD, QUERY_FIELD])
    def get(self, request):
        """Return one page of drugs; with ?query=, a boosted wildcard search.

        NOTE(review): the slice offset is page - 1 rather than
        (page - 1) * page_size, so consecutive pages overlap whenever
        page_size > 1 — confirm this is intended.
        """
        page = int(request.GET.get('page', 1))
        page_size = settings.REST_FRAMEWORK.get('PAGE_SIZE')
        query_word = request.GET.get('query', None)
        if not query_word:
            # No query: plain match_all, paginated by slicing the search.
            s = self.search.query("match_all")[page - 1:page - 1 + page_size]
            res = s.execute().to_dict()['hits']['hits']
            return Response(data=res, status=status.HTTP_200_OK)
        # Case-insensitive prefix search: append '*' for the wildcards below.
        query_word = query_word.lower() + "*"
        query = {
            "dis_max": {
                "queries": [
                    {
                        "wildcard": {
                            "trade_name": {
                                "value": query_word,
                                "boost": 3.0
                            }
                        }
                    },
                    {
                        "wildcard": {
                            "international_name.name": {
                                "value": query_word,
                                "boost": 3.0
                            }
                        }
                    },
                    {
                        "wildcard": {
                            "formula": {
                                "value": query_word,
                                "boost": 2.0
                            }
                        }
                    },
                    {
                        # NOTE(review): this field name contains a space
                        # ("registration number") — verify it matches the
                        # index mapping.
                        "wildcard": {
                            "registration number": {
                                "value": query_word,
                                "boost": 1.0
                            }
                        }
                    },
                    {
                        "wildcard": {
                            "INN.name": {
                                "value": query_word,
                                "boost": 0.5
                            }
                        }
                    },
                    {
                        # ATC codes are nested documents.
                        "nested": {
                            "path": "atcs",
                            "query": {
                                "wildcard": {
                                    "atcs.name": {
                                        "value": query_word,
                                        "boost": 0.5
                                    }
                                }
                            }
                        }
                    },
                ],
            }
        }
        s = self.search.query(query)[page - 1:page - 1 + page_size]
        res = s.execute().to_dict()['hits']['hits']
        return Response(data=res, status=status.HTTP_200_OK)

    @swagger_auto_schema(request_body=DrugSerializer)
    def post(self, request):
        """Validate, persist and index a new drug document."""
        serializer = DrugSerializer(data=request.data)
        if serializer.is_valid():
            serializer.create(serializer.validated_data)
            # Mirror the new record into the Elasticsearch index.
            DrugDocument(serializer.validated_data).save(using=self.client)
            return Response(data=serializer.validated_data,
                            status=status.HTTP_201_CREATED)
        return Response(data=serializer.errors,
                        status=status.HTTP_400_BAD_REQUEST)
def obj_create(self, bundle, **kwargs):
    """Run a faceted, ACL-aware search (tastypie POST) over the selected indices.

    Request body keys read from ``bundle.data``:
        text: free-text search term.
        TypeTag: list of 'Experiment'/'Dataset'/'Datafile' choosing indices.
        StartDate/EndDate: UTC timestamps ('%Y-%m-%dT%H:%M:%S.%fZ'); the
            created_time range filter is only applied when StartDate is given.
        InstrumentList: instrument names restricting the dataset query.

    Authenticated users match documents owned by themselves, any of their
    groups, or public ones (public_access == 100); anonymous users match
    public documents only. The bucketed hits are attached to bundle.obj
    as a SearchObject.
    """
    user = bundle.request.user
    groups = user.groups.all()
    # if anonymous user search public data only
    query_text = bundle.data.get("text", None)
    type_tag = bundle.data.get("TypeTag", [])
    index_list = []
    # Map the requested type tags onto index names.
    for type in type_tag:  # NOTE(review): 'type' shadows the builtin
        if type == 'Experiment':
            index_list.append('experiments')
        elif type == 'Dataset':
            index_list.append('dataset')
        elif type == 'Datafile':
            index_list.append('datafile')
    end_date = bundle.data.get("EndDate", None)
    start_date = bundle.data.get("StartDate", None)
    if end_date is not None:
        # Convert the UTC timestamp to a local-timezone date.
        end_date_utc = datetime.datetime.strptime(end_date, "%Y-%m-%dT%H:%M:%S.%fZ") \
            .replace(tzinfo=pytz.timezone('UTC'))
        end_date = end_date_utc.astimezone(LOCAL_TZ).date()
    else:
        # set end date to today's date
        # NOTE(review): this branch leaves end_date a datetime, whereas the
        # branch above produces a date — confirm both work in the range query.
        end_date = datetime.datetime.today().replace(
            tzinfo=pytz.timezone('UTC'))
    if start_date:
        start_date_utc = datetime.datetime.strptime(start_date, "%Y-%m-%dT%H:%M:%S.%fZ") \
            .replace(tzinfo=pytz.timezone('UTC'))
        start_date = start_date_utc.astimezone(LOCAL_TZ).date()
    instrument_list = bundle.data.get("InstrumentList", None)
    instrument_list_id = []
    if instrument_list:
        # Resolve instrument names to ids for the terms filter below.
        for ins in instrument_list:
            instrument_list_id.append(
                Instrument.objects.get(name__exact=ins).id)
    # query for experiment model
    ms = MultiSearch(index=index_list)
    if 'experiments' in index_list:
        query_exp = Q("match", title=query_text)
        if user.is_authenticated:
            # Own, group-shared or public experiments.
            query_exp_oacl = Q("term", objectacls__entityId=user.id) | \
                Q("term", public_access=100)
            for group in groups:
                query_exp_oacl = query_exp_oacl | \
                    Q("term", objectacls__entityId=group.id)
        else:
            query_exp_oacl = Q("term", public_access=100)
        if start_date is not None:
            query_exp = query_exp & Q("range", created_time={
                'gte': start_date,
                'lte': end_date
            })
        query_exp = query_exp & query_exp_oacl
        ms = ms.add(
            Search(index='experiments').extra(
                size=MAX_SEARCH_RESULTS,
                min_score=MIN_CUTOFF_SCORE).query(query_exp))
    if 'dataset' in index_list:
        query_dataset = Q("match", description=query_text)
        if user.is_authenticated:
            # ACL clause targets the nested 'experiments' documents.
            query_dataset_oacl = Q("term", **{'experiments.objectacls.entityId': user.id}) | \
                Q("term", **{'experiments.public_access': 100})
            for group in groups:
                query_dataset_oacl = query_dataset_oacl | \
                    Q("term", **{'experiments.objectacls.entityId': group.id})
        else:
            query_dataset_oacl = Q("term",
                                   **{'experiments.public_access': 100})
        if start_date is not None:
            query_dataset = query_dataset & Q("range", created_time={
                'gte': start_date,
                'lte': end_date
            })
        if instrument_list:
            query_dataset = query_dataset & Q(
                "terms", **{'instrument.id': instrument_list_id})
        # add instrument query
        ms = ms.add(
            Search(index='dataset').extra(
                size=MAX_SEARCH_RESULTS,
                min_score=MIN_CUTOFF_SCORE).query(query_dataset).query(
                    'nested', path='experiments', query=query_dataset_oacl))
    if 'datafile' in index_list:
        query_datafile = Q("match", filename=query_text)
        if user.is_authenticated:
            # ACL clause targets the doubly nested dataset.experiments docs.
            query_datafile_oacl = Q("term", **{'dataset.experiments.objectacls.entityId': user.id}) | \
                Q("term", **{'dataset.experiments.public_access': 100})
            for group in groups:
                query_datafile_oacl = query_datafile_oacl | \
                    Q("term", **{'dataset.experiments.objectacls.entityId': group.id})
        else:
            query_datafile_oacl = Q(
                "term", **{'dataset.experiments.public_access': 100})
        if start_date is not None:
            query_datafile = query_datafile & Q("range", created_time={
                'gte': start_date,
                'lte': end_date
            })
        ms = ms.add(
            Search(index='datafile').extra(
                size=MAX_SEARCH_RESULTS,
                min_score=MIN_CUTOFF_SCORE).query(query_datafile).query(
                    'nested', path='dataset.experiments',
                    query=query_datafile_oacl))
    result = ms.execute()
    # Bucket the raw hits by the index each one came from.
    result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
    for item in result:
        for hit in item.hits.hits:
            if hit["_index"] == "dataset":
                result_dict["datasets"].append(hit)
            elif hit["_index"] == "experiments":
                result_dict["experiments"].append(hit)
            elif hit["_index"] == "datafile":
                result_dict["datafiles"].append(hit)
    if bundle.request.method == 'POST':
        bundle.obj = SearchObject(id=1, hits=result_dict)
    return bundle