Example #1
def simple_search_public_data(query_text):
    result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
    index_list = ['experiments', 'dataset', 'datafile']
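    # MultiSearch batches the per-index queries below into a single _msearch request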
    ms = MultiSearch(index=index_list)
    query_exp = Q("match", title=query_text)
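    # public_access == 100 corresponds to fully public data in MyTardis (PUBLIC_ACCESS_FULL)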
    query_exp_oacl = Q("term", public_access=100)
    query_exp = query_exp & query_exp_oacl
    ms = ms.add(Search(index='experiments')
                .extra(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
                .query(query_exp))
    query_dataset = Q("match", description=query_text)
    query_dataset_oacl = Q("term", **{'experiments.public_access': 100})
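    # the dataset index embeds experiments as nested documents, so the ACL term runs via a nested query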
    ms = ms.add(Search(index='dataset')
                .extra(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE).query(query_dataset)
                .query('nested', path='experiments', query=query_dataset_oacl))
    query_datafile = Q("match", filename=query_text)
    query_datafile_oacl = Q("term", experiments__public_access=100)
    query_datafile = query_datafile & query_datafile_oacl
    ms = ms.add(Search(index='datafile')
                .extra(size=MAX_SEARCH_RESULTS, min_score=MIN_CUTOFF_SCORE)
                .query(query_datafile))
    results = ms.execute()
    for item in results:
        for hit in item.hits.hits:
            if hit["_index"] == "dataset":
                result_dict["datasets"].append(hit.to_dict())

            elif hit["_index"] == "experiments":
                result_dict["experiments"].append(hit.to_dict())

            elif hit["_index"] == "datafile":
                result_dict["datafiles"].append(hit.to_dict())
    return result_dict
Example #2
def search_request(request):
    if request.method == 'GET':
        if q := request.GET.get('q', None):
            object_list = []
            search = Search(
                index=['songs', 'artists', 'albums', 'labels', 'genres'])
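            # dis_max scores a document by its best-matching clause, so an
            # exact phrase hit outranks the fuzzy fallback below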
            objects = search.from_dict({
                "query": {
                    "dis_max": {
                        "queries": [{
                            "multi_match": {
                                "query": q,
                                "type": "phrase",
                                "fields": [
                                    "title^20", "lyrics^10", "name^100",
                                    "description^50"
                                ]
                            }
                        }, {
                            "multi_match": {
                                "query": q,
                                "fuzziness": "AUTO",
                                "fields": [
                                    "title^2", "lyrics", "name^10",
                                    "description^5"
                                ]
                            }
                        }]
                    }
                }
            })

            for obj in objects:
                print(obj)

                row = {
                    'id': obj.meta.id,
                    'score': obj.meta.score,
                    'url': 'main:' + obj.meta.index[:-1] + '-detail',
                    'model': obj.meta.index,
                }
                if obj.meta.index == 'songs':
                    row['text'] = f'Song: {obj.title}'
                elif obj.meta.index == 'artists':
                    row['text'] = f'Artist: {obj.name}'
                elif obj.meta.index == 'labels':
                    row['text'] = f'Label: {obj.name}'
                elif obj.meta.index == 'genres':
                    row['text'] = f'Genre: {obj.name}'
                elif obj.meta.index == 'albums':
                    row['text'] = f'Album: {obj.title}'
                object_list.append(row)
            return render(request=request,
                          template_name="main/search.html",
                          context={'object_list': object_list})
Example #3
    def get_object_list(self, request):
        user = request.user
        query_text = request.GET.get('query', None)
        if not user.is_authenticated:
            result_dict = simple_search_public_data(query_text)
            return [SearchObject(id=1, hits=result_dict)]
        groups = user.groups.all()
        index_list = ['experiments', 'dataset', 'datafile']
        ms = MultiSearch(index=index_list)

        query_exp = Q("match", title=query_text)
        query_exp_oacl = Q("term", objectacls__entityId=user.id) | \
            Q("term", public_access=100)
        for group in groups:
            query_exp_oacl = query_exp_oacl | \
                                 Q("term", objectacls__entityId=group.id)
        query_exp = query_exp & query_exp_oacl
        ms = ms.add(
            Search(index='experiments').extra(
                size=MAX_SEARCH_RESULTS,
                min_score=MIN_CUTOFF_SCORE).query(query_exp))

        query_dataset = Q("match", description=query_text)
        query_dataset_oacl = Q("term", **{'experiments.objectacls.entityId': user.id}) | \
            Q("term", **{'experiments.public_access': 100})
        for group in groups:
            query_dataset_oacl = query_dataset_oacl | \
                                 Q("term", **{'experiments.objectacls.entityId': group.id})
        ms = ms.add(
            Search(index='dataset').extra(
                size=MAX_SEARCH_RESULTS,
                min_score=MIN_CUTOFF_SCORE).query(query_dataset).query(
                    'nested', path='experiments', query=query_dataset_oacl))

        query_datafile = Q("match", filename=query_text)
        query_datafile_oacl = Q("term", experiments__objectacls__entityId=user.id) | \
            Q("term", experiments__public_access=100)
        for group in groups:
            query_datafile_oacl = query_datafile_oacl | \
                                 Q("term", experiments__objectacls__entityId=group.id)
        query_datafile = query_datafile & query_datafile_oacl
        ms = ms.add(
            Search(index='datafile').extra(
                size=MAX_SEARCH_RESULTS,
                min_score=MIN_CUTOFF_SCORE).query(query_datafile))
        results = ms.execute()
        result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
        for item in results:
            for hit in item.hits.hits:
                if hit["_index"] == "dataset":
                    result_dict["datasets"].append(hit.to_dict())

                elif hit["_index"] == "experiments":
                    result_dict["experiments"].append(hit.to_dict())

                elif hit["_index"] == "datafile":
                    result_dict["datafiles"].append(hit.to_dict())

        return [SearchObject(id=1, hits=result_dict)]
Example #4
    def filter(self, qs, value):
        client = Elasticsearch([settings.ELASTICSEARCH_HOST])
        value = value.lower()

        search_query = {
            "bool": {
                "must_not": [  # исключает из выдачи is_published=False
                    {
                        "term": {
                            "is_published": False
                        }
                    }
                ],
                "should": [
                    {
                        "simple_query_string": {
                            "fields": ["category_name"],
                            "quote_field_suffix": ".exact",
                            "query": value
                        }
                    },
                ]
            }
        }

        s = Search(using=client, index='category') \
            .query(search_query)\
            .sort("_score", "-views")\
            .extra(size=self.max_result, from_=0)

        hits_list = []
        items = s.execute()
        if items:
            for item in items:
                hits_list.append(item.meta.id)
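            # preserve the relevance order returned by Elasticsearch in the ORM queryset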
            hits_order = Case(
                *[When(pk=pk, then=pos) for pos, pk in enumerate(hits_list)])
            qs = qs.filter(id__in=hits_list).order_by(hits_order)
        else:
            qs = qs.none()

            # TODO: fallback?
            # bits = value.split(' ')
            # search_clauses = reduce(operator.and_,
            #                         [Q(title__icontains=v) for v in bits])
            # unpublished = Category.objects.get_queryset_descendants(
            #     Category.objects.filter(is_published=False), include_self=True)
            # qs = (qs
            #       .exclude(pk__in=unpublished)
            #       .filter(search_clauses)
            #       .order_by('-views'))
        return qs[:self.max_result]
Example #5
    def _do_check(self):
        """
        Performs a basic check on the database by performing a select query on a simple table then
        performs a basic check on ElasticSearch by performing a search without exceptions occuring
        :return: False according to results of check, True if successful False if there is a fail
        """
        try:
            # Perform database check
            HealthCheck.objects.get(health_check_field=True)

            # Perform Elasticsearch check
            client = Elasticsearch(hosts=[settings.ES_URL])
            query_object = {
                "multi_match": {
                    "query": "a_commodity_or_code",
                    "type": "most_fields",
                    "fields": ["keywords", "description"],
                    "operator": "and" if "," not in "a_commodity_or_code" else "or",
                }
            }
            # Execute the search so the check actually reaches Elasticsearch;
            # building a lazy Search object alone sends no request.
            Search().index("indexes").using(client).query(
                query_object).sort("sort_object").execute()

            # Return success if we have reached this point
            return True

        except Exception as e:
            capture_exception(e)
            return False
Example #6
    def highlight(self, search: Search) -> Search:
        # TODO: Why did we have this?
        # search = search.highlight_options(require_field_match=False)
        search = search.highlight("*",
                                  fragment_size=150,
                                  pre_tags="<mark>",
                                  post_tags="</mark>")
        return search
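A minimal usage sketch for the method above; the index and field names are assumptions for illustration. Highlighted fragments come back on hit.meta.highlight:

from elasticsearch_dsl import Search

# Hypothetical usage; "documents" and the "text" field are assumptions.
s = Search(index="documents").query("match", text="budget report")
s = s.highlight("*",
                fragment_size=150,
                pre_tags="<mark>",
                post_tags="</mark>")
for hit in s.execute():
    # hit.meta.highlight maps each matched field to its <mark>-tagged fragments
    for field, fragments in hit.meta.highlight.to_dict().items():
        print(field, fragments)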
Example #7
def autocomplete_search(q, doc_type=None, fuzzy_mode=False, **kwargs):

    query = autocomplete_query(q, fuzzy_mode)
    limit = kwargs.get('limit', 20)
    offset = kwargs.get('offset', 0)
    filters = kwargs.get('filters', {})

    if limit and limit > 100:
        limit = 100

    s = Search().index('_all')

    if doc_type:
        s = s.doc_type(doc_type)

    s = s.query('match', autocomplete=query)

    # TODO: implement in a generic way
    # add filters like: `&filter_status=Ready&filter_type=Broadcasts`
    for key, value in filters.items():
        s = s.query('term', **{key: value[0]})

    s = s[offset:limit + offset]

    return format_search_results(s.execute())
Example #8
def autocomplete(query: str) -> Response:
    """
    https://www.elastic.co/guide/en/elasticsearch/guide/current/_index_time_search_as_you_type.html
    We use the ngram-based autocomplete analyzer for indexing, but the standard analyzer for searching.
    This way we enforce that the whole entered word has to be matched (save for some fuzziness), and the
    algorithm does not fall back to matching only the first character in extreme cases. This prevents
    absurd cases where "Garret Walker" and "Hector Mendoza" are suggested when we're entering "Mahatma Ghandi"
    """
    search_query = Search(index=list(DOCUMENT_INDICES.values()))
    search_query = search_query.query(
        "match",
        autocomplete={
            "query": escape_elasticsearch_query(query),
            "analyzer": "standard",
            "fuzziness": "AUTO",
            "prefix_length": 1,
        },
    )
    search_query = search_query.extra(min_score=1)
    search_query = search_query.update_from_dict({
        "indices_boost": [
            {
                DOCUMENT_INDICES["person"]: 4
            },
            {
                DOCUMENT_INDICES["organization"]: 4
            },
            {
                DOCUMENT_INDICES["paper"]: 2
            },
        ]
    })
    response = search_query.execute()
    return response
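A minimal sketch of the index-time side this docstring relies on, using elasticsearch-dsl's analyzer API; the analyzer name, document class, and ngram bounds are assumptions, not taken from the original:

from elasticsearch_dsl import Document, Text, analyzer, token_filter

# Hypothetical edge-ngram analyzer: applied at index time only, while the
# standard analyzer (passed per-query above) is used at search time.
autocomplete_analyzer = analyzer(
    "autocomplete_analyzer",
    tokenizer="standard",
    filter=[
        "lowercase",
        token_filter("autocomplete_filter", type="edge_ngram",
                     min_gram=1, max_gram=20),
    ],
)

class PersonDocument(Document):
    # search_analyzer overrides the analyzer applied to query strings
    autocomplete = Text(analyzer=autocomplete_analyzer,
                        search_analyzer="standard")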
Example #9
def search_by_code(code):

    processed_query = process_commodity_code(code)
    query_object = {"term": {"commodity_code": processed_query}}

    client = Elasticsearch(hosts=[settings.ES_URL])
    hits = Search().index(*alias_names).using(client).query(query_object)
    for hit in hits:
        try:
            hit["hierarchy_context"] = json.loads(hit["hierarchy_context"])
        except KeyError as exception:
            logger.info("{0} {1}".format(hit["commodity_code"],
                                         exception.args))
    return hits
Example #10
def _add_date_before(search: Search, params: Dict[str, Any], options,
                     errors) -> Search:
    """Filters by a date given a string, catching parsing errors."""
    try:
        before = parse(params["before"])
    except (ValueError, OverflowError) as e:
        errors.append(
            gettext(
                f"The value for before is invalid. The correct format is 'YYYY-MM-DD' or 'YYYY-MM-DD HH:MM:SS': {e}"
            ))
        return search
    search = search.filter(
        Q("range", start={"lte": before})
        | Q("range", legal_date={"lte": before}))
    options["before"] = before
    return search
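A hypothetical call to the helper above; `parse` is presumably dateutil.parser.parse, and the index name here is an assumption:

from typing import Any, Dict, List

from elasticsearch_dsl import Search

# Hypothetical usage; the index name and date value are assumptions.
errors: List[str] = []
options: Dict[str, Any] = {}
search = Search(index="documents")
search = _add_date_before(search, {"before": "2020-01-31"}, options, errors)
if errors:
    print(errors[0])          # parsing failed; the search came back unfiltered
else:
    print(options["before"])  # datetime.datetime(2020, 1, 31, 0, 0)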
Example #11
def _build_search_request(query, sort_key, sort_order, filter_on_leaf=None):
    client = Elasticsearch(hosts=[settings.ES_URL])

    sort_object = {sort_key: sort_order}
    query_object = {
        "multi_match": {
            "query": query,
            "type": "most_fields",
            "fields": ["keywords", "description"],
            "operator": "and" if "," not in query else "or",
        }
    }

    request = (Search().index(*alias_names)
               .using(client)
               .query(query_object)
               .sort(sort_object))

    if filter_on_leaf:
        request = request.filter("term", leaf=filter_on_leaf)

    return request
Example #12
    def query(self, search: Search, query: str) -> Search:
        if query:
            self.options["searchterm"] = query
            # Fuzziness AUTO(=2) gives more error tolerance, but is also a lot slower and has
            # many false positives. We're using https://stackoverflow.com/a/35375562/3549270
            # to make exact matches score higher than fuzzy matches.
            search = search.query(
                Bool(should=[
                    MultiMatch(
                        query=escape_elasticsearch_query(query),
                        operator="and",
                        fields=self.fields,
                    ),
                    MultiMatch(
                        query=escape_elasticsearch_query(query),
                        operator="and",
                        fields=self.fields,
                        fuzziness="1",
                        prefix_length=1,
                    ),
                ]))
        return search
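For reference, roughly the body the Bool/MultiMatch combination above serializes to, assuming self.fields == ["name", "text"] and query == "budget" (both illustrative):

# Sketch of the generated query body under the assumptions above.
expected_body = {
    "bool": {
        "should": [
            # exact clause: matches without fuzziness, so it scores higher
            {"multi_match": {"query": "budget", "operator": "and",
                             "fields": ["name", "text"]}},
            # fuzzy clause: tolerates one edit per term
            {"multi_match": {"query": "budget", "operator": "and",
                             "fields": ["name", "text"],
                             "fuzziness": "1", "prefix_length": 1}},
        ]
    }
}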
Example #13
    def filter(self, qs, value):
        # initialize the connection
        client = Elasticsearch([settings.ELASTICSEARCH_HOST])
        value = value.lower()

        # build the query
        search_query = {
            "bool": {
                "must_not": [  # исключает из выдачи is_published=False
                    {
                        "term": {
                            "is_published": False
                        }
                    }
                ],
                "should": [
                    {
                        "simple_query_string": {  # ищем что-то разумное
                            "fields": ["fullname", "category_name"],
                            "quote_field_suffix": ".exact",
                            "query": value
                        }
                    },
                    {
                        # partial match against transliterated strings (en->ru)
                        # constant_score disables boosting by term frequency
                        "constant_score": {
                            "filter": {
                                "match": {
                                    "fullname_translit": {
                                        "query": value,
                                        "fuzziness": 1,
                                        "operator": "and",
                                    }
                                }
                            }
                        }
                    },
                ]
            }
        }

        # build the search request
        s = Search(using=client, index='offer') \
            .query(search_query)\
            .sort("_score", "-views")\
            .extra(size=self.max_result, from_=0)

        self.hits_list = []
        items = s.execute()
        if items:
            for item in items:
                self.hits_list.append(item.meta.id)
            # needed so the Postgres queryset keeps the order returned by Elasticsearch
            self.hits_order = Case(*[
                When(pk=pk, then=pos) for pos, pk in enumerate(self.hits_list)
            ])
            qs = qs.filter(id__in=self.hits_list).order_by(self.hits_order)
        else:
            qs = qs.none()

        # TODO: old implementation. Maybe keep it as a fallback?
        # else:
        #     qs = qs.annotate(full_name=Concat(
        #         'product__article', Value(' '),
        #         'product__title', Value(' '),
        #         'product__search_title'))
        #     bits = value.split(' ')
        #     if len(bits) == 1 and bits[0].isdecimal():
        #         full_name_clauses = Q(full_name__icontains=bits[0])
        #     else:
        #         full_name_clauses = reduce(
        #             operator.and_,
        #             [Q(full_name__iregex=r'(^|\s)%s' % escape(v))
        #              for v in bits])
        #
        #     unpublished = Category.objects.get_queryset_descendants(
        #         Category.objects.filter(is_published=False),
        #         include_self=True)
        #
        #     qs = (qs.filter(full_name_clauses)
        #             .exclude(product__category__in=unpublished))
        #
        #     if self.uniq_category:
        #         products = (qs.order_by('product__category__title')
        #                       .distinct('product__category__title'))
        #         qs = (qs.filter(id__in=products)
        #                 .order_by('-product__category__views'))

        return qs
Example #14
class DrugsListView(APIView):
    client = Elasticsearch(hosts=[{"host": "elasticsearch", "port": 9200}])
    search = Search(index='drugs').using(client).sort('trade_name.raw')

    @swagger_auto_schema(manual_parameters=[PAGE_FIELD, QUERY_FIELD])
    def get(self, request):
        page = int(request.GET.get('page', 1))
        page_size = settings.REST_FRAMEWORK.get('PAGE_SIZE')
        query_word = request.GET.get('query', None)
        if not query_word:
            s = self.search.query("match_all")[page - 1:page - 1 + page_size]
            res = s.execute().to_dict()['hits']['hits']
            return Response(data=res, status=status.HTTP_200_OK)
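        # wildcard queries bypass analysis, so lowercase manually to match the indexed terms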
        query_word = query_word.lower() + "*"
        query = {
            "dis_max": {
                "queries": [
                    {
                        "wildcard": {
                            "trade_name": {
                                "value": query_word,
                                "boost": 3.0
                            }
                        }
                    },
                    {
                        "wildcard": {
                            "international_name.name": {
                                "value": query_word,
                                "boost": 3.0
                            }
                        }
                    },
                    {
                        "wildcard": {
                            "formula": {
                                "value": query_word,
                                "boost": 2.0
                            }
                        }
                    },
                    {
                        "wildcard": {
                            "registration number": {
                                "value": query_word,
                                "boost": 1.0
                            }
                        }
                    },
                    {
                        "wildcard": {
                            "INN.name": {
                                "value": query_word,
                                "boost": 0.5
                            }
                        }
                    },
                    {
                        "nested": {
                            "path": "atcs",
                            "query": {
                                "wildcard": {
                                    "atcs.name": {
                                        "value": query_word,
                                        "boost": 0.5
                                    }
                                }
                            }
                        }
                    },
                ],
            }
        }
        s = self.search.query(query)[(page - 1) * page_size:page * page_size]
        res = s.execute().to_dict()['hits']['hits']
        return Response(data=res, status=status.HTTP_200_OK)

    @swagger_auto_schema(request_body=DrugSerializer)
    def post(self, request):
        serializer = DrugSerializer(data=request.data)
        if serializer.is_valid():
            serializer.create(serializer.validated_data)
            DrugDocument(serializer.validated_data).save(using=self.client)
            return Response(data=serializer.validated_data,
                            status=status.HTTP_201_CREATED)
        return Response(data=serializer.errors,
                        status=status.HTTP_400_BAD_REQUEST)
Example #15
    def obj_create(self, bundle, **kwargs):
        user = bundle.request.user
        groups = user.groups.all()

        # if the user is anonymous, search public data only
        query_text = bundle.data.get("text", None)
        type_tag = bundle.data.get("TypeTag", [])
        index_list = []
        for type in type_tag:
            if type == 'Experiment':
                index_list.append('experiments')
            elif type == 'Dataset':
                index_list.append('dataset')
            elif type == 'Datafile':
                index_list.append('datafile')
        end_date = bundle.data.get("EndDate", None)
        start_date = bundle.data.get("StartDate", None)
        if end_date is not None:
            end_date_utc = datetime.datetime.strptime(end_date, "%Y-%m-%dT%H:%M:%S.%fZ") \
                .replace(tzinfo=pytz.timezone('UTC'))
            end_date = end_date_utc.astimezone(LOCAL_TZ).date()
        else:
            # set end date to today's date
            end_date = datetime.datetime.today().replace(
                tzinfo=pytz.timezone('UTC'))
        if start_date:
            start_date_utc = datetime.datetime.strptime(start_date, "%Y-%m-%dT%H:%M:%S.%fZ") \
                .replace(tzinfo=pytz.timezone('UTC'))
            start_date = start_date_utc.astimezone(LOCAL_TZ).date()
        instrument_list = bundle.data.get("InstrumentList", None)
        instrument_list_id = []
        if instrument_list:
            for ins in instrument_list:
                instrument_list_id.append(
                    Instrument.objects.get(name__exact=ins).id)
        # query for experiment model
        ms = MultiSearch(index=index_list)
        if 'experiments' in index_list:
            query_exp = Q("match", title=query_text)
            if user.is_authenticated:
                query_exp_oacl = Q("term", objectacls__entityId=user.id) | \
                                 Q("term", public_access=100)
                for group in groups:
                    query_exp_oacl = query_exp_oacl | \
                                     Q("term", objectacls__entityId=group.id)
            else:
                query_exp_oacl = Q("term", public_access=100)
            if start_date is not None:
                query_exp = query_exp & Q("range",
                                          created_time={
                                              'gte': start_date,
                                              'lte': end_date
                                          })
            query_exp = query_exp & query_exp_oacl
            ms = ms.add(
                Search(index='experiments').extra(
                    size=MAX_SEARCH_RESULTS,
                    min_score=MIN_CUTOFF_SCORE).query(query_exp))
        if 'dataset' in index_list:
            query_dataset = Q("match", description=query_text)
            if user.is_authenticated:
                query_dataset_oacl = Q("term", **{'experiments.objectacls.entityId': user.id}) | \
                                     Q("term", **{'experiments.public_access': 100})
                for group in groups:
                    query_dataset_oacl = query_dataset_oacl | \
                                         Q("term", **{'experiments.objectacls.entityId': group.id})
            else:
                query_dataset_oacl = Q("term",
                                       **{'experiments.public_access': 100})
            if start_date is not None:
                query_dataset = query_dataset & Q("range",
                                                  created_time={
                                                      'gte': start_date,
                                                      'lte': end_date
                                                  })
            if instrument_list:
                query_dataset = query_dataset & Q(
                    "terms", **{'instrument.id': instrument_list_id})
            # add instrument query
            ms = ms.add(
                Search(index='dataset').extra(
                    size=MAX_SEARCH_RESULTS,
                    min_score=MIN_CUTOFF_SCORE).query(query_dataset).query(
                        'nested', path='experiments',
                        query=query_dataset_oacl))
        if 'datafile' in index_list:
            query_datafile = Q("match", filename=query_text)
            if user.is_authenticated:
                query_datafile_oacl = Q("term", **{'dataset.experiments.objectacls.entityId': user.id}) | \
                                      Q("term", **{'dataset.experiments.public_access': 100})
                for group in groups:
                    query_datafile_oacl = query_datafile_oacl | \
                                          Q("term", **{'dataset.experiments.objectacls.entityId': group.id})
            else:
                query_datafile_oacl = Q(
                    "term", **{'dataset.experiments.public_access': 100})
            if start_date is not None:
                query_datafile = query_datafile & Q("range",
                                                    created_time={
                                                        'gte': start_date,
                                                        'lte': end_date
                                                    })
            ms = ms.add(
                Search(index='datafile').extra(
                    size=MAX_SEARCH_RESULTS,
                    min_score=MIN_CUTOFF_SCORE).query(query_datafile).query(
                        'nested',
                        path='dataset.experiments',
                        query=query_datafile_oacl))
        result = ms.execute()
        result_dict = {k: [] for k in ["experiments", "datasets", "datafiles"]}
        for item in result:
            for hit in item.hits.hits:
                if hit["_index"] == "dataset":
                    result_dict["datasets"].append(hit)

                elif hit["_index"] == "experiments":
                    result_dict["experiments"].append(hit)

                elif hit["_index"] == "datafile":
                    result_dict["datafiles"].append(hit)

        if bundle.request.method == 'POST':
            bundle.obj = SearchObject(id=1, hits=result_dict)
        return bundle