Exemple #1
0
    def build(self):
        """Return a clone of this search with facet aggregations and a post filter.

        For every facet whose ``include_<name>`` key is present in
        ``self.args``, each of its aggregations is nested under a ``filter``
        aggregation named ``_filter_<agg_name>``.  That filter combines the
        filters of all *other* facets that are currently filtered, so a
        facet's own selection does not restrict its own buckets.  Finally the
        filters of every filtered facet are AND-ed into the clone's post
        filter.
        """
        search = self._clone()

        for facet in self.facets:
            if "include_%s" % facet.name in self.args:
                # Start from match_all and narrow it with every other active
                # facet's filter.
                others_filter = esd.Q("match_all")
                for other in self.facets:
                    if other.name != facet.name and other.is_filtered(self.args):
                        others_filter &= other.filters(self.args)

                for agg_name, agg in facet.aggregates():
                    wrapper = search.aggs.bucket(
                        "_filter_" + agg_name, "filter", filter=others_filter)
                    wrapper.bucket(agg_name, agg)

        combined = esd.Q('match_all')
        for facet in self.facets:
            if facet.is_filtered(self.args):
                combined &= facet.filters(self.args)
        search.post_filter._proxied &= combined

        return search
def count_by_country(vk_elastic_db: es_client.VkDataDatabaseClient,
                     size=10,
                     is_need_active=False,
                     days_delta=20):
    """Count documents per country title using a terms aggregation.

    Only documents that have a ``country.title.keyword`` value, and whose
    value does not match the empty string, are counted.

    :param vk_elastic_db: database client handed to ``get_elastic_object``.
    :param size: maximum number of country buckets requested.
    :param is_need_active: when true, the search is first narrowed by
        ``get_active_users_filter``.
    :param days_delta: forwarded to ``get_active_users_filter``.
    :return: list of ``{'country': ..., 'count': ...}`` dicts, after
        ``add_geoposition`` has been called on it (presumably enriching the
        entries in place -- TODO confirm).
    """
    country_aggs_name = "country_count"
    country_field = "country.title.keyword"

    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)

    s = s.filter(
        "bool",
        must=[elasticsearch_dsl.Q("exists", field=country_field)])
    # Fixed: the keyword argument used to be ``country__title__keywordd``
    # (trailing double "d"), which addressed a non-existent field, so empty
    # country titles were never excluded.  elasticsearch-dsl expands ``__``
    # in keyword names to ``.``.
    s = s.filter(
        "bool",
        must_not=[elasticsearch_dsl.Q("match", country__title__keyword="")])

    s.aggs.bucket(
        country_aggs_name,
        elasticsearch_dsl.A('terms', field=country_field, size=size))
    response = s.execute()

    data = [
        {'country': bucket.key, 'count': bucket.doc_count}
        for bucket in response.aggregations[country_aggs_name].buckets
    ]
    add_geoposition(data)
    return data
Exemple #3
0
 def filters(self, args):
     """Return the query that applies this facet's selection.

     An empty-string value under ``self.name`` selects documents where the
     field is missing; otherwise a single-value ``terms`` query is built from
     whatever ``args.get(self.name)`` returns.
     """
     name = self.name
     wants_missing = name in args and args[name] == ""
     if wants_missing:
         return esd.Q('missing', field=name)

     selected = [args.get(name)]
     return esd.Q('terms', **{name: selected})
Exemple #4
0
def mongodb_condition_to_es(key, value):
    """Translate one MongoDB field condition into an Elasticsearch range query.

    A plain (non-dict) ``value`` is treated as equality and becomes a range
    with identical ``gte`` and ``lte`` bounds.  A dict must hold exactly one
    of ``$gte``, ``$lte``, ``$gt`` or ``$lt``; the operator name without the
    ``$`` becomes the single range bound.

    ``bson.ObjectId`` operands are converted with ``bson_oid_to_int`` and
    ``datetime`` operands with ``datetime_to_unix``; the latter also add
    ``'format': 'epoch_millis'`` to the range body.

    :raises NotImplementedError: for dicts with more or fewer than one entry
        or with an operator outside the four listed above.
    """
    if isinstance(value, dict):
        if len(value) != 1:
            raise NotImplementedError
        (op, operand), = value.items()
        if op not in ('$gte', '$lte', '$gt', '$lt'):
            raise NotImplementedError
        bound_names = (op[1:],)  # strip the leading "$"
    else:
        operand = value
        bound_names = ("gte", "lte")

    if isinstance(operand, bson.ObjectId):
        operand = bson_oid_to_int(operand)
    is_datetime = isinstance(operand, datetime.datetime)
    if is_datetime:
        operand = datetime_to_unix(operand)

    bounds = {bound: operand for bound in bound_names}
    if is_datetime:
        bounds['format'] = 'epoch_millis'
    return elasticsearch_dsl.Q("range", **{key: bounds})
Exemple #5
0
def add_altitude_info(provincia, municipio=None):
    """Fetch elevations for the plots of a province and bulk-update them.

    Documents of the ``plots`` index are filtered by ``provincia`` (and by
    ``municipio`` when given), scanned, and processed in chunks of
    ``chunk_size`` (read from the ``Google Elevation`` config section).  Each
    chunk is passed to ``obtain_elevation_from_google`` and its result to
    ``util.elastic_bulk_update``.

    :param provincia: value matched against the ``provincia`` field.
    :param municipio: optional value matched against the ``municipio`` field.
    :raises Exception: re-raised, after being reported to
        ``conf.error_handler``, when ``sigpac_record.init()`` fails.

    NOTE(review): this block uses Python 2 syntax (``print`` statements and
    ``e.message``).
    """
    chunk_size = conf.config.getint('Google Elevation', 'chunk_size')
    print "Chunk Size = " + str(chunk_size)

    try:
        sigpac_record.init()
        # NOTE(review): presumably gives the index time to become available
        # after init() -- TODO confirm why a fixed 5 s pause is needed.
        time.sleep(5)
    except Exception as e:
        conf.error_handler.error(__name__, "build_record", str(e))
        conf.error_handler.flush()
        raise

    filter = [dsl.Q("term", provincia=provincia)]
    if municipio is not None:
        filter.append(dsl.Q("term", municipio=municipio))

    # Query Elasticsearch for the necessary records, fetching only the
    # bounding-box centre coordinates.
    search = dsl.Search(index='plots').query('bool', filter=filter).fields(
        ['bbox_center.lat', 'bbox_center.lon'])
    # NOTE(review): the response of this call is discarded; the records are
    # read through search.scan() below -- confirm whether it is needed.
    search.execute()

    # `records` and `centers` are parallel lists: centers[i] is the
    # (lat, lon) pair for records[i].
    records = []
    centers = []
    for r in search.scan():
        record = sigpac_record(meta={'id': r.meta.id})

        records.append(record)
        centers.append((r['bbox_center.lat'][0], r['bbox_center.lon'][0]))

        # Flush a full chunk.
        if len(records) >= chunk_size:
            print "Inserting next " + str(chunk_size) + " elevations"
            try:
                records = obtain_elevation_from_google(records, centers)
                print " ... Obtained info from google"
                util.elastic_bulk_update(records)
                print " ...success"
            except ConnectionError as e:
                print " ...error"
                conf.error_handler.error(__name__,
                                         'obtain_elevation_from_google',
                                         e.message)

            # The buffers are reset even after a ConnectionError, so a failed
            # chunk is reported but not retried.
            records = []
            centers = []

    # Flush the final, partially filled chunk.
    if len(records) > 0:
        try:
            records = obtain_elevation_from_google(records, centers)
            util.elastic_bulk_update(records)
        except ConnectionError as e:
            conf.error_handler.error(__name__, 'obtain_elevation_from_google',
                                     e.message)
Exemple #6
0
 def filters(self, args):
     """Return the query for this facet built from the request arguments.

     A non-empty value supplied under ``self.name`` yields a ``prefix`` query
     on that field.  When the key is absent or its value is empty, the parent
     class's ``filters`` decides what to return.

     :param args: mapping of request arguments keyed by facet name.
     :return: an elasticsearch-dsl query object.
     """
     # Fixed: the guard used to be ``len(self.name) > 0``, i.e. it measured
     # the facet's own name instead of the supplied value, so an empty value
     # was turned into an empty ``prefix`` query rather than reaching the
     # parent implementation.
     if self.name in args and args[self.name]:
         return esd.Q('prefix', **{
             self.name: args.get(self.name),
         })
     return super().filters(args)
Exemple #7
0
    def list(self, filter, sort, page, page_size, user):
        """Return one page of search results wrapped in ``SearchResultWrapper``.

        The search object is taken from the raw backend of the last parent.
        A ``q`` request argument becomes a ``query_string`` query (last value,
        decoded as UTF-8); without it the results are sorted by ``ref``
        ascending.  A ``sort`` request argument applies ``sort.key`` in the
        direction given by ``sort.order`` (``1`` means ascending).

        :raises AttributeError: re-raised when the search object cannot be
            reached on the parent collection.
        """
        # Rip the search object out of the elasticsearch backend.
        try:
            backend = self.parents[-1]._state._backend
            search = backend.raw_backend().search
        except AttributeError:
            print('Tried to search on an unsearchable collection')
            raise

        query_arguments = self._request.query_arguments

        raw_q = query_arguments.get('q')
        if not raw_q:
            # This should technically live elsewhere, but the search object
            # offers no nice way to tell whether a query is present.
            default_order = {'ref': {'order': 'asc', 'unmapped_type': 'string'}}
            search = search.sort(default_order)
        else:
            text = raw_q[-1].decode('utf-8')
            search = search.query(elasticsearch_dsl.Q('query_string', query=text))

        if query_arguments.get('sort'):
            direction = 'asc' if sort.order == 1 else 'desc'
            search = search.sort(
                {sort.key: {'order': direction, 'unmapped_type': 'string'}})

        first = page * page_size
        return SearchResultWrapper(search[first:first + page_size])
Exemple #8
0
 def filters(self):
     """Return a ``range`` query on ``self.name`` built from ``self.value``.

     ``self.value[0]`` supplies the ``from`` bound and ``self.value[1]`` the
     ``to`` bound; a bound that is ``None`` is left out.
     """
     bounds = {}
     for label, position in (("from", 0), ("to", 1)):
         endpoint = self.value[position]
         if endpoint is not None:
             bounds[label] = endpoint
     return esd.Q('range', **{self.name: bounds})
    def get_cost_by_product(cls,
                            key,
                            date_from=None,
                            date_to=None,
                            without_discount=False,
                            only_discount=False,
                            size=0x7FFFFFFF):
        """Sum the cost per product for one linked account over a date range.

        ``date_from`` defaults to the first instant of the current UTC month
        and ``date_to`` to the last microsecond of ``date_from``'s month.
        Line items with a cost of 0 are excluded.

        :param key: linked account id to filter on.
        :param without_discount: exclude items whose description is
            ``PAR_APN_ProgramFee_2500``.
        :param only_discount: keep only items with that description.
        :param size: maximum number of product buckets.
        :return: ``{'products': [{'product': ..., 'cost': ...}, ...]}``,
            ordered by cost descending; product names are shortened through
            ``SHORT_NAMES`` where a mapping exists.
        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            last_day = calendar.monthrange(date_from.year, date_from.month)[1]
            date_to = date_from.replace(day=last_day,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        usage_window = {
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        }
        s = cls.search().filter('term', linked_account_id=key)
        s = s.filter('range', usage_start_date=usage_window)

        if without_discount:
            not_discount = ~dsl.Q('term',
                                  item_description='PAR_APN_ProgramFee_2500')
            s = s.query('bool', filter=[not_discount])
        if only_discount:
            s = s.filter('term', item_description='PAR_APN_ProgramFee_2500')

        # Bucket by product, ordered by the nested "cost" sum.
        per_product = s.aggs.bucket('products',
                                    'terms',
                                    field='product_name',
                                    order={'cost': 'desc'},
                                    size=size)
        per_product.bucket('cost', 'sum', field='cost')

        non_zero_cost = ~dsl.Q('term', cost=0)
        s = s.query('bool', filter=[non_zero_cost])

        # size=0: only the aggregations are needed, not the hits.
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        products = []
        for bucket in res['aggregations']['products']['buckets']:
            products.append({
                'product': SHORT_NAMES.get(bucket['key'], bucket['key']),
                'cost': bucket['cost']['value'],
            })
        return {'products': products}
 def construct_regexp_query(field_in: str, regex: str):
     """OR together one ``regexp`` query per field derived from ``field_in``.

     The field names come from ``extract_field_plus_rawfield(field_in)``.
     Returns ``None`` when that yields no fields.
     """
     combined = None
     for field in extract_field_plus_rawfield(field_in):
         clause = es_dsl.Q("regexp", **{field: regex})
         combined = (combined | clause) if combined else clause
     return combined
Exemple #11
0
 def occam_search(self, search_input, index_name='occam_index'):
     """Run a ``multi_match`` search and return the post-processed response.

     ``search_input`` is matched against the ``name``, ``summary`` and
     ``environment`` fields of ``index_name``.  The executed response is
     handed to ``help_return``, whose result is returned.
     """
     # NOTE: provisional -- this only mirrors what the original test covered.
     matcher = elasticsearch_dsl.Q(
         'multi_match',
         query=search_input,
         fields=['name', 'summary', 'environment'])
     search = elasticsearch_dsl.Search(using=self.es, index=index_name)
     response = search.query(matcher).execute()
     # Per the original note, help_return yields a list of dicts holding the
     # data Occam requires from a search.
     return help_return(response)
Exemple #12
0
    def build_es_query(self):
        """Build the Elasticsearch search for this query and cache it.

        The search is a ``bool`` query that must match both the Lucene query
        string from ``self._qd['query_opts']['args']`` and the time range on
        ``self._ts_field``, and must not match ``self.exclude``.  When the
        query definition carries aggregations they are attached through
        ``self.build_aggs`` and the aggregation slice is applied; otherwise
        the hit slice is applied.

        Side effects: a list-valued ``self.exclude`` is replaced by its
        ``' OR '``-joined string; the search is stored in ``self._esq`` and
        its dict form in ``self._esqd``.

        :raises TQLException: when attaching the first aggregation fails.
        """
        qd = self._qd

        time_window = {
            'from': self._start_time.format(self._ISO_TS),
            'to': self._end_time.format(self._ISO_TS)
        }
        rangeq = elasticsearch_dsl.Q(
            'range', **{'{}'.format(self._ts_field): time_window})
        luceneq = elasticsearch_dsl.Q('query_string',
                                      query=qd['query_opts']['args'])

        if isinstance(self.exclude, list):
            self.exclude = ' OR '.join(self.exclude)
        excludeq = elasticsearch_dsl.Q('query_string', query=self.exclude)

        combined = elasticsearch_dsl.Q('bool',
                                       must=[luceneq, rangeq],
                                       must_not=excludeq)
        s = elasticsearch_dsl.Search().source(include=qd['fields'])
        s = s.query(combined)

        if 'aggs' not in qd or len(qd.get('aggs')) < 1:
            # No aggregations: page through plain hits.
            s = s[self.hit_size_from:self.hit_size]
        else:
            aggs = qd['aggs']
            try:
                aggobj = self.build_aggs(s.aggs, aggs[0])
            except Exception as e:
                raise TQLException(
                    "Unable to agg base: reason: {}, agg: {}".format(e, aggs))

            # Any remaining aggregations are attached below the first one.
            if len(aggs) > 1:
                aggobj = self.build_aggs(aggobj, aggs[1:])
            s = s[self.agg_size_from:self.agg_size]

        self._esq = s
        self._esqd = self._esq.to_dict()
Exemple #13
0
def search_by_keywords(terms):
    """Return up to 250 tweets whose ``keywords`` match any given term.

    ``terms`` is split on whitespace; each word becomes a ``match`` clause
    and at least one clause has to match.  Every hit is copied into a plain
    dict, its ``raw`` entry is removed and its ``id`` is prefixed with the
    Twitter status URL.
    """
    # One "match" clause per whitespace-separated word.
    clauses = [
        elasticsearch_dsl.Q("match", keywords=word) for word in terms.split()
    ]
    any_keyword = elasticsearch_dsl.Q(
        "bool", should=clauses, minimum_should_match=1)
    search = elasticsearch_dsl.Search(using=es, index="tweet").query(any_keyword)

    tweets = []
    # Only the first 250 hits are requested.
    for hit in search[:250]:
        tweet = dict(hit._d_)
        del tweet["raw"]
        # NOTE(review): there is no "/" between "statuses" and the id --
        # confirm whether stored ids already start with one.
        tweet["id"] = "https://twitter.com/statuses" + tweet["id"]
        tweets.append(tweet)

    return tweets
 def construct_unary_op(arg: ast.Node, query_creator):
     """Build the query for a unary operator applied to ``arg``.

     A non-logical ``arg`` is passed straight to ``query_creator``.  For a
     logical ``arg`` each child is converted -- plain arguments directly,
     nested logical groups (one level) into their own ``bool`` query -- and
     the results are combined in a ``bool`` query.  Children that are neither
     are skipped.
     """

     def combine(node, clauses):
         # AND -> must, OR -> should, anything else -> must_not.
         if is_a(node, op.ARG_AND):
             return es_dsl.Q("bool", must=clauses)
         if is_a(node, op.ARG_OR):
             return es_dsl.Q("bool", should=clauses)
         return es_dsl.Q("bool", must_not=clauses)

     if not is_a(arg, op.ARG_LOGICAL):
         return query_creator(arg)

     clauses = []
     for child in arg.children:
         if is_a(child, op.ARGS):
             clauses.append(query_creator(child))
         elif is_a(child, op.ARG_LOGICAL):
             nested = [query_creator(leaf) for leaf in child.children]
             clauses.append(combine(child, nested))
     return combine(arg, clauses)
    def search_ids(self, args, resource_id: str, entry_ids: str):
        """Fetch entries of one resource by their document ids.

        :param args: not used by this method.
        :param resource_id: name of the index to search.
        :param entry_ids: comma-separated document ids.
        :return: whatever ``self._format_result`` produces for the response.
        """
        logger.info(
            "Called EsSearch.search_ids(self, args, resource_id, entry_ids) with:"
        )
        logger.info("  resource_id = {}".format(resource_id))
        logger.info("  entry_ids = {}".format(entry_ids))

        id_query = es_dsl.Q("terms", _id=entry_ids.split(","))
        logger.debug("query = {}".format(id_query))

        search = es_dsl.Search(using=self.es, index=resource_id)
        search = search.query(id_query)
        logger.debug("s = {}".format(search.to_dict()))

        hits = search.execute()
        return self._format_result([resource_id], hits)
Exemple #16
0
 def aggregate(self, search):
     """
     Attach one filtered aggregation per facet to ``search``.

     Each facet's aggregation is nested under a ``_filter_<name>`` filter
     aggregation that applies every stored filter except the facet's own.
     Filters on fields starting with "date" are also left out for facets
     whose name starts with "date".
     """
     for name, facet in iteritems(self.facets):
         facet_agg = facet.get_aggregation()
         other_filters = esd.Q('match_all')
         name_is_date = name.startswith("date")
         for field, field_filter in iteritems(self._filters):
             if name != field and not (name_is_date and field.startswith("date")):
                 other_filters &= field_filter
         holder = search.aggs.bucket(
             '_filter_' + name, 'filter', filter=other_filters)
         holder.bucket(name, facet_agg)
    def get_yearly_cost_by_product(cls,
                                   keys,
                                   date_from=None,
                                   date_to=None,
                                   size=0x7FFFFFFF):
        """Sum the cost per product and year for one or more linked accounts.

        ``date_from`` defaults to the first instant of the current UTC year
        and ``date_to`` to the last microsecond of ``date_from``'s year.
        Line items with a cost of 0 are excluded.

        :param keys: a linked account id or a list of them.
        :param size: maximum number of product buckets per year.
        :return: ``{'years': [{'year': 'YYYY', 'products': [...]}, ...]}``
            where each product entry has ``product`` and ``cost`` keys;
            product names are shortened through ``SHORT_NAMES`` where a
            mapping exists.
        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                month=1, day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            date_to = date_from.replace(month=12,
                                        day=31,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        account_ids = keys if isinstance(keys, list) else [keys]
        usage_window = {
            'from': date_from.isoformat(),
            'to': date_to.isoformat()
        }
        s = cls.search().filter('terms', linked_account_id=account_ids)
        s = s.filter('range', usage_start_date=usage_window)

        # Yearly histogram -> products -> summed cost.
        per_year = s.aggs.bucket('intervals',
                                 'date_histogram',
                                 field='usage_start_date',
                                 interval='year',
                                 min_doc_count=1)
        per_product = per_year.bucket(
            'products', 'terms', field='product_name', size=size)
        per_product.metric('cost', 'sum', field='cost')

        non_zero_cost = ~dsl.Q('term', cost=0)
        s = s.query('bool', filter=[non_zero_cost])

        # size=0: only the aggregations are needed, not the hits.
        res = client.search(index='awsdetailedlineitem',
                            body=s.to_dict(),
                            size=0,
                            request_timeout=60)

        years = []
        for interval in res['aggregations']['intervals']['buckets']:
            products = []
            for product in interval['products']['buckets']:
                products.append({
                    'product': SHORT_NAMES.get(product['key'], product['key']),
                    'cost': product['cost']['value'],
                })
            years.append({
                # The first four characters of the bucket label are the year.
                'year': interval['key_as_string'][:4],
                'products': products,
            })
        return {'years': years}
Exemple #18
0
    def _do_query(self, value, term):
        """Return a formatted date histogram of messages matching a participant term.

        A ``term`` query on ``{term: value}`` is run as a nested query on
        ``participants`` against the ``indexed_message`` doc type of the
        index named by ``self.user_id``.  Matching messages are bucketed by
        ``date`` at ``self.resolution`` and the buckets are passed to
        ``self._format_results``.
        """
        participant_match = dsl.Q('term', **{term: value})
        base = dsl.Search(using=self.esclient,
                          index=self.user_id,
                          doc_type='indexed_message')
        search = base.query('nested',
                            path='participants',
                            score_mode='avg',
                            query=participant_match)

        search.aggs.bucket('messages_with_value',
                           'date_histogram',
                           field='date',
                           interval=self.resolution)

        response = search.execute()
        buckets = response.aggregations.messages_with_value['buckets']
        return self._format_results(buckets)
Exemple #19
0
    def get(self, handler):
        """Write one page of search results for ``self.collection`` to ``handler``.

        A ``q`` request argument becomes a ``query_string`` query (last value,
        decoded as UTF-8); without it the results are sorted by ``ref``
        ascending.  A ``sort`` request argument applies ``handler.sort``.
        The response carries ``meta`` (total count and page size), an empty
        ``links`` object and the serialized resources under ``data``.

        Side effects: replaces ``handler._serializer`` and appends
        ``self.collection`` to ``handler._view.parents``.
        """
        sort = handler.sort
        # Rip the search object out of the elasticsearch backend.
        search = self.collection._state._backend.raw_backend().search
        query_arguments = handler.request.query_arguments

        raw_q = query_arguments.get('q')
        if not raw_q:
            # This should technically live elsewhere, but the search object
            # offers no nice way to tell whether a query is present.
            default_order = {'ref': {'order': 'asc', 'unmapped_type': 'string'}}
            search = search.sort(default_order)
        else:
            text = raw_q[-1].decode('utf-8')
            search = search.query(elasticsearch_dsl.Q('query_string', query=text))

        if query_arguments.get('sort'):
            direction = 'asc' if sort.order == 1 else 'desc'
            search = search.sort(
                {sort.key: {'order': direction, 'unmapped_type': 'string'}})

        # Hacking into the serializer
        handler._serializer = self.get_serializer()
        handler._view.parents = handler._view.parents + (self.collection, )

        first = handler.page * handler.page_size
        wrapper = SearchResultWrapper(search[first:first + handler.page_size])

        meta = {'total': wrapper.count(), 'perPage': handler.page_size}
        data = [handler.serialize(resource) for resource in wrapper]
        return handler.write({
            'meta': meta,
            # TODO
            'links': {},
            'data': data
        })
Exemple #20
0
    def _filter(self,
                criteria: Q,
                offset: int = 0,
                limit: int = 10,
                order_by: list = ()) -> ResultSet:
        """
        Query the data store and return the matching window as a `ResultSet`.

        `criteria` is translated with `_build_filters` when it has children;
        otherwise the library's default query is used. `order_by` entries are
        passed to `Search.sort`, and `offset`/`limit` select the slice of
        hits. Any exception raised while executing or wrapping the response
        is logged and re-raised.
        """
        conn = self.provider.get_connection()

        # Build the filters from the criteria
        if criteria.children:
            query = self._build_filters(criteria)
        else:
            query = elasticsearch_dsl.Q()

        search = Search(using=conn, index=self.model_cls._index._name)
        search = search.query(query).params(version=True)
        if order_by:
            search = search.sort(*order_by)
        window = search[offset:offset + limit]

        # Return the results
        try:
            response = window.execute()
            return ResultSet(
                offset=offset,
                limit=limit,
                total=response.hits.total.value,
                items=response.hits,
            )
        except Exception as exc:
            logger.error(f"Error while filtering: {exc}")
            raise
Exemple #21
0
    def _delete_all(self, criteria: Q = None):
        """Delete every record matching `criteria` and return the deleted count.

        Without criteria (or with criteria that have no children) the
        library's default query is used. The index is refreshed afterwards.
        Any exception raised by the delete or the refresh is logged and
        re-raised.
        """
        conn = self._get_session()

        # Build the filters from the criteria
        if criteria and criteria.children:
            query = self._build_filters(criteria)
        else:
            query = elasticsearch_dsl.Q()

        search = Search(using=conn, index=self.model_cls._index._name)
        search = search.query(query)

        try:
            response = search.delete()

            # `Search.delete` does not refresh index, so we have to manually refresh
            # NOTE(review): the search above targets `model_cls._index._name`
            # while the refresh uses `entity_cls.meta_.schema_name` -- confirm
            # both name the same index.
            Index(name=self.entity_cls.meta_.schema_name, using=conn).refresh()
        except Exception as exc:
            logger.error(f"Error while deleting records: {exc}")
            raise

        return response.deleted
Exemple #22
0
    def get(self, request: http.HttpRequest
            ) -> typing.Union[tuple, http.HttpResponse]:
        """Render the paginated post list, optionally narrowed by a search.

        When the search form validates, posts are searched on ``text``,
        ``author``, ``title``, ``category`` and ``location``, and comments on
        ``text`` and ``author``; the post of every matching comment is added
        to the result.  A ``terms`` query is used for several
        space-separated words and a ``match`` query for a single word.  The
        result may then be narrowed by the form's ``published`` time range.
        If nothing is found, or the form is invalid, all posts are listed.

        The template context holds ``form``, ``page_obj``, ``search`` (the
        number of results, 0 when none), ``is_a_search`` and ``site_url``.
        """
        # site_models is really slow, so the settings object is used instead:
        #   site = site_models.Site.objects.get_current()
        search = 0
        # NOTE(review): p_c is not referenced again in this method.
        p_c = None
        is_a_search = False
        form = artisan_forms.PostListSearch(request.GET)
        # A search-object factory class could hide the search implementation
        # here and allow the search method (elasticsearch, postgres full
        # text, etc.) to be changed.
        if form.is_valid(
        ):
            is_a_search = True
            terms = form.cleaned_data['q'].split(' ')
            # Several words -> "terms" query on the list; one word -> "match"
            # query on the bare string.
            if len(terms) > 1:
                t = 'terms'
            else:
                t = 'match'
                terms = terms[0]
            queryset = artisan_documents.Post.search().query(
                elasticsearch_dsl.Q(t, text=terms)
                | elasticsearch_dsl.Q(t, author=terms)
                | elasticsearch_dsl.Q(t, title=terms)
                | elasticsearch_dsl.Q(t, category=terms)
                | elasticsearch_dsl.Q(t, location=terms)).to_queryset()
            queryset_comments = forum_documents.Comment.search().query(
                elasticsearch_dsl.Q(t, text=terms)
                | elasticsearch_dsl.Q(t, author=terms)).to_queryset()
            # Add the post of every matching comment (one ORM filter per
            # comment).
            for sr in queryset_comments:
                queryset = queryset | artisan_models.Post.objects.filter(
                    id=sr.post_fk.id)
            # SECURITY(review): eval() is applied to a value taken from the
            # submitted form.  This is only safe if the ``published`` field
            # restricts its values to a fixed set of attribute names --
            # verify, and replace with an explicit lookup.
            time_range = eval(
                'form.' +
                form['published'].value())  # TODO: remove eval
            search = len(queryset)
            if search and time_range:
                # time_range[0] is used as the exclusive upper bound and
                # time_range[1] as the exclusive lower bound on created_at.
                queryset = (
                    queryset.filter(
                        created_at__lt=time_range[0],
                        created_at__gt=time_range[1]).order_by('-pinned').
                    select_related('author').select_related('author__profile').
                    select_related('author__profile__avatar'))
                search = len(queryset)
            # Nothing found: fall back to listing every post.
            if not search:
                queryset = (artisan_models.Post.objects.select_related(
                    'author').select_related('author__profile').select_related(
                        'author__profile__avatar').order_by('-pinned'))
        else:
            # Not a (valid) search: drop the form errors and list every post.
            form.errors.clear()
            queryset = (artisan_models.Post.objects.select_related(
                'author').select_related('author__profile').select_related(
                    'author__profile__avatar').order_by('-pinned'))

        paginator = pagination.Paginator(queryset, self.paginate_by)

        page_number = request.GET.get('page')
        page_obj = paginator.get_page(page_number)
        # NOTE(review): site_url reads self.request rather than the
        # ``request`` parameter.
        context = {
            'form':
            form,
            'page_obj':
            page_obj,
            'search':
            search,
            'is_a_search':
            is_a_search,
            'site_url': ('https' if self.request.is_secure() else 'http') +
            '://' + conf.settings.SITE_DOMAIN
        }
        return shortcuts.render(request, self.template_name, context)
 def construct_exists_query(node: ast.Node):
     """Return an ``exists`` query for the field named by ``node``'s value."""
     field_name = get_value(node)
     return es_dsl.Q("exists", field=field_name)
 def construct_freetext_query(node):
     """Return a ``multi_match`` query for ``node``'s value.

     String nodes additionally get ``fuzziness=1``.
     """
     extra = {}
     if is_a(node, op.STRING):
         extra["fuzziness"] = 1
     return es_dsl.Q("multi_match", query=get_value(node), **extra)
 def construct_range_query(field: str, range_args: Dict):
     """Return a ``range`` query on ``field`` with ``range_args`` as its body."""
     body = {field: range_args}
     return es_dsl.Q("range", **body)
def create_es_query(node: ast.Node):
    node.pprint(0)
    if node is None:
        raise TypeError()

    def extract_values(n: ast.Node) -> List[ast.AnyValue]:
        values = []
        if is_a(n, op.ARG_LOGICAL):
            for child in n.children:
                values.append(get_value(child))

        return values

    def extract_values_and_logicals(
        n: ast.Node,
    ) -> Tuple[List[ast.AnyValue], List[ast.Node]]:
        values = []
        logicals = []
        if is_a(n, op.ARG_LOGICAL):
            for child in n.children:
                if is_a(child, op.ARG_LOGICAL):
                    logicals.append(child)
                else:
                    values.append(get_value(child))
        return values, logicals

    def construct_binary_op(field: ast.Node, arg: ast.Node, query_creator):
        q = None
        if is_a(field, op.ARG_LOGICAL):
            if is_a(arg, op.ARG_LOGICAL):
                pass
            else:
                queries = []
                for n in field.children:
                    if is_a(n, op.ARGS):
                        queries.append(query_creator(n, arg))
                    elif is_a(n, op.ARG_LOGICAL):
                        l_queries = []
                        for child in n.children:
                            l_queries.append(query_creator(child, arg))
                        if is_a(n, op.ARG_AND):
                            queries.append(es_dsl.Q("bool", must=l_queries))
                        elif is_a(n, op.ARG_OR):
                            queries.append(es_dsl.Q("bool", should=l_queries))
                        else:
                            queries.append(es_dsl.Q("bool", must_not=l_queries))
                if is_a(arg, op.ARG_AND):
                    q = es_dsl.Q("bool", must=queries)
                elif is_a(arg, op.ARG_OR):
                    q = es_dsl.Q("bool", should=queries)
                else:
                    q = es_dsl.Q("bool", must_not=queries)
                if not arg_values:

                    def prepare_equals_arg(node, v):
                        return v

                    arg = prepare_equals_arg(node, get_value(arg2))
                    queries = [
                        construct_equals_query(field, arg) for field in field_values
                    ]
                    for logical in field_logicals:
                        l_queries = []
                        for field in logical.children:
                            l_queries.append(construct_equals_query(field.value, arg))
                        if is_a(logical, op.ARG_OR):
                            queries.append(es_dsl.Q("bool", should=l_queries))
                        elif is_a(logical, op.ARG_AND):
                            queries.append(es_dsl.Q("bool", must=l_queries))
                        else:
                            queries.append(es_dsl.Q("bool", must_not=l_queries))
                    print(
                        "|create_es_query::EQUALS| queries = {queries}".format(
                            queries=queries
                        )
                    )
                    if is_a(arg1, op.ARG_AND):
                        q = es_dsl.Q("bool", must=queries)
                    elif is_a(arg1, op.ARG_OR):
                        q = es_dsl.Q("bool", should=queries)
                    else:
                        q = es_dsl.Q(
                            "bool",
                            must_not=queries,
                            must=es_dsl.Q("multi_match", query=get_value(arg2)),
                        )
        else:
            if is_a(arg, op.ARG_LOGICAL):
                queries = []
                for n in arg.children:
                    if is_a(n, op.ARGS):
                        queries.append(query_creator(field, n))
                    elif is_a(n, op.ARG_LOGICAL):
                        l_queries = []
                        for child in n.children:
                            l_queries.append(query_creator(field, child))
                        if is_a(n, op.ARG_AND):
                            queries.append(es_dsl.Q("bool", must=l_queries))
                        elif is_a(n, op.ARG_OR):
                            queries.append(es_dsl.Q("bool", should=l_queries))
                        else:
                            queries.append(es_dsl.Q("bool", must_not=l_queries))
                if is_a(arg, op.ARG_AND):
                    q = es_dsl.Q("bool", must=queries)
                elif is_a(arg, op.ARG_OR):
                    q = es_dsl.Q("bool", should=queries)
                else:
                    q = es_dsl.Q("bool", must_not=queries)
            else:
                q = query_creator(field, arg)
        return q

    def construct_equals_query(field: ast.Node, query: ast.Node):
        kwargs = {get_value(field): {"query": get_value(query), "operator": "and"}}
        return es_dsl.Q("match", **kwargs)

    q = None
    if is_a(node, op.LOGICAL):
        # TODO check minimum should match rules in different contexts
        queries = [create_es_query(n) for n in node.children]
        if len(queries) == 2:
            q1 = queries[0]
            q2 = queries[1]
            print("q1 = {}".format(q1.to_dict()))
            print("q2 = {}".format(repr(q2)))
            q1_dict = q1.to_dict()
            q2_dict = q2.to_dict()
            if "range" in q1_dict and "range" in q2_dict:
                for q1_field, q1_value in q1_dict["range"].items():
                    for q2_field, q2_value in q2_dict["range"].items():
                        if q1_field == q2_field:
                            print("q1_field == q2_field")
                            range_args = q1_value
                            range_args.update(q2_value)
                            print("field = {}".format(q1_field))
                            print("range_args = {}".format(range_args))
                            q = es_dsl.Q("range", **{q1_field: range_args})
                            return q
        if is_a(node, op.AND):
            q = es_dsl.Q("bool", must=queries)
        elif is_a(node, op.OR):
            q = es_dsl.Q("bool", should=queries)
        else:
            q = es_dsl.Q("bool", must_not=queries)
    elif is_a(node, op.UNARY_OPS):
        arg = node.children[0]
        arg_values, arg_logicals = extract_values_and_logicals(arg)
        print("arg_values = {}".format(arg_values))
        print("arg_logicals = {}".format(arg_logicals))

        def construct_unary_op(arg: ast.Node, query_creator):
            q = None
            if is_a(arg, op.ARG_LOGICAL):
                queries = []
                for n in arg.children:
                    if is_a(n, op.ARGS):
                        queries.append(query_creator(n))
                    elif is_a(n, op.ARG_LOGICAL):
                        l_queries = []
                        for child in n.children:
                            l_queries.append(query_creator(child))
                        if is_a(n, op.ARG_AND):
                            queries.append(es_dsl.Q("bool", must=l_queries))
                        elif is_a(n, op.ARG_OR):
                            queries.append(es_dsl.Q("bool", should=l_queries))
                        else:
                            queries.append(es_dsl.Q("bool", must_not=l_queries))
                if is_a(arg, op.ARG_AND):
                    q = es_dsl.Q("bool", must=queries)
                elif is_a(arg, op.ARG_OR):
                    q = es_dsl.Q("bool", should=queries)
                else:
                    q = es_dsl.Q("bool", must_not=queries)
            else:
                q = query_creator(arg)
            return q

        def construct_exists_query(node: ast.Node):
            return es_dsl.Q("exists", field=get_value(node))

        if is_a(node, op.FREETEXT):

            def construct_freetext_query(node):
                if is_a(node, op.STRING):
                    return es_dsl.Q("multi_match", query=get_value(node), fuzziness=1)
                else:
                    return es_dsl.Q("multi_match", query=get_value(node))

            if not arg_values:
                q = construct_freetext_query(arg)
            else:
                queries = [construct_freetext_query(n) for n in arg.children]
                if is_a(arg, op.ARG_OR):
                    q = es_dsl.Q("bool", should=queries)
                elif is_a(arg, op.ARG_AND):
                    q = es_dsl.Q("bool", must=queries)
                else:
                    q = es_dsl.Q("bool", must_not=queries)
        elif is_a(node, op.FREERGXP):
            kwargs = {"default_field": "*"}
            if not arg_values:
                kwargs["query"] = "/{}/".format(arg.value)
            else:
                if is_a(arg, op.ARG_OR):
                    operator = " OR "
                else:  # if is_a(arg, op.ARG_AND):
                    operator = " AND "
                kwargs["query"] = operator.join("(/{}/)".format(v) for v in arg_values)
            print("kwargs = {}".format(kwargs))
            q = es_dsl.Q("query_string", **kwargs)
            if is_a(arg, op.ARG_NOT):
                q = es_dsl.Q("bool", must_not=q)
        elif is_a(node, op.EXISTS):
            q = construct_unary_op(node.children[0], construct_exists_query)

        elif is_a(node, op.MISSING):
            if not arg_values:
                q = es_dsl.Q("bool", must_not=es_dsl.Q("exists", field=arg.value))
            else:
                queries = [es_dsl.Q("exists", field=value) for value in arg_values]
                for logical in arg_logicals:
                    l_queries = []
                    for field in logical.children:
                        l_queries.append(construct_exists_query(field))
                    if is_a(logical, op.ARG_AND):
                        queries.append(es_dsl.Q("bool", must=l_queries))
                    elif is_a(logical, op.ARG_OR):
                        queries.append(es_dsl.Q("bool", should=l_queries))
                    else:  # if is_a(logical, op.ARG_NOT):
                        queries.append(es_dsl.Q("bool", must_not=l_queries))

                if is_a(arg, op.ARG_AND):
                    q = es_dsl.Q("bool", must_not=queries)
                elif is_a(arg, op.ARG_OR):
                    q = None
                    for query in queries:
                        q_tmp = es_dsl.Q("bool", must_not=query)
                        if not q:
                            q = q_tmp
                        else:
                            q = q | q_tmp
                else:  # is_a(arg, op.ARG_NOT):
                    q = es_dsl.Q("bool", must=queries)
        else:
            raise UnsupportedQuery("not implemented")
    elif is_a(node, op.BINARY_OPS):
        arg1 = node.children[0]
        arg2 = node.children[1]
        field_values, field_logicals = extract_values_and_logicals(arg1)
        arg_values = extract_values(arg2)

        print("field_values = {field_values}".format(field_values=field_values))
        print("field_logicals = {field_logicals}".format(field_logicals=field_logicals))

        print("arg_values = {}".format(arg_values))
        # TODO this check breaks and and or since they always (?) have ast.ArgNode as parameters
        # if not isinstance(arg1, ast.ArgNode) or not isinstance(arg2, ast.ArgNode):
        # TODO these need to be moved outside of current query, for example:
        # "equals|name||or|Partille|Kumla" could be expressed in two ways
        # es_dsl.Q('terms', name=['Partille', 'Kumla'])
        # or
        # es_dsl.Q('bool', should=[es_dsl.Q('term', name='Partille'), es_dsl.Q('term', name='Kumla')])
        # but "regexp|name||or|Part*|Kum*"
        # can only be expressed as in the longer form above
        # raise UnsupportedQuery()

        if is_a(node, op.EQUALS):

            # q = construct_binary_op(node.children[0], node.children[1], construct_equals_query)
            # if len(field_values) == 1:
            if not field_values and not field_logicals:
                if not arg_values:
                    q = construct_equals_query(arg1, arg2)
                else:
                    queries = [
                        construct_equals_query(get_value(arg1), query)
                        for query in arg_values
                    ]
                    if is_a(arg2, op.ARG_AND):
                        q = es_dsl.Q("bool", must=queries)
                    elif is_a(arg2, op.ARG_OR):
                        q = es_dsl.Q("bool", should=queries)
                    else:
                        q = es_dsl.Q(
                            "bool", must_not=queries
                        )  # , must=es_dsl.Q('multi_match', query=get_value(arg1)))

            else:  # if field_values:
                # if len(arg_values) == 1:
                if not arg_values:

                    def prepare_equals_arg(node, v):
                        return v

                    arg = prepare_equals_arg(node, get_value(arg2))
                    queries = [
                        construct_equals_query(field, arg) for field in field_values
                    ]
                    for logical in field_logicals:
                        l_queries = []
                        for field in logical.children:
                            l_queries.append(construct_equals_query(field.value, arg))
                        if is_a(logical, op.ARG_OR):
                            queries.append(es_dsl.Q("bool", should=l_queries))
                        elif is_a(logical, op.ARG_AND):
                            queries.append(es_dsl.Q("bool", must=l_queries))
                        else:
                            queries.append(es_dsl.Q("bool", must_not=l_queries))
                    print(
                        "|create_es_query::EQUALS| queries = {queries}".format(
                            queries=queries
                        )
                    )
                    if is_a(arg1, op.ARG_AND):
                        q = es_dsl.Q("bool", must=queries)
                    elif is_a(arg1, op.ARG_OR):
                        q = es_dsl.Q("bool", should=queries)
                    else:
                        q = es_dsl.Q(
                            "bool",
                            must_not=queries,
                            must=es_dsl.Q("multi_match", query=get_value(arg2)),
                        )
                else:  # if arg_values:
                    raise UnsupportedQuery("Don't know how to handle ")

        elif is_a(node, op.REGEX_OPS):

            def extract_field_plus_rawfield(field: str):
                if field.endswith(".raw"):
                    yield field
                else:
                    yield field
                    yield field + ".raw"

            def prepare_regex(node, s: str):
                if is_a(node, op.CONTAINS):
                    return ".*" + re.escape(s) + ".*"
                elif is_a(node, op.STARTSWITH):
                    return re.escape(s) + ".*"
                elif is_a(node, op.ENDSWITH):
                    return ".*" + re.escape(s)
                else:
                    return s

            # Construct query
            def construct_regexp_query(field_in: str, regex: str):
                q = None
                for field in extract_field_plus_rawfield(field_in):
                    q_tmp = es_dsl.Q("regexp", **{field: regex})
                    if not q:
                        q = q_tmp
                    else:
                        q = q | q_tmp
                return q

            q = None
            if not field_values and not field_logicals:
                if not arg_values:
                    q = construct_regexp_query(
                        get_value(arg1), prepare_regex(node, get_value(arg2))
                    )
                else:
                    queries = []
                    for regex in arg_values:
                        q_tmp = construct_regexp_query(
                            get_value(arg1), prepare_regex(node, regex)
                        )
                        print("q_tmp = {q_tmp}".format(q_tmp=q_tmp))

                        queries.append(q_tmp)

                    if is_a(arg2, op.ARG_OR):
                        q = es_dsl.Q("bool", should=queries)
                    elif is_a(arg2, op.ARG_AND):
                        q = es_dsl.Q("bool", must=queries)
                    else:
                        q = es_dsl.Q(
                            "bool", must_not=queries
                        )  # , must=es_dsl.Q('query_string', **kwargs))
            else:  # if field_values:
                if not arg_values:
                    regex = prepare_regex(node, get_value(arg2))
                    queries = []
                    for field in field_values:
                        q_tmp = construct_regexp_query(field, regex)
                        print("q_tmp = {q_tmp}".format(q_tmp=q_tmp))

                        queries.append(q_tmp)
                    for logical in field_logicals:
                        l_queries = []
                        for field in logical.children:
                            l_queries.append(construct_regexp_query(field.value, regex))
                        if is_a(logical, op.ARG_OR):
                            queries.append(es_dsl.Q("bool", should=l_queries))
                        elif is_a(logical, op.ARG_AND):
                            queries.append(es_dsl.Q("bool", must=l_queries))
                        else:
                            queries.append(es_dsl.Q("bool", must_not=l_queries))

                    print(
                        "|create_es_query::REGEX| queries = {queries}".format(
                            queries=queries
                        )
                    )
                    if is_a(arg1, op.ARG_OR):
                        q = es_dsl.Q("bool", should=queries)
                    elif is_a(arg1, op.ARG_AND):
                        q = es_dsl.Q("bool", must=queries)
                    else:
                        kwargs = {"default_field": "*"}
                        kwargs["query"] = "/{}/".format(regex)

                        print("regex NOT kwargs = {}".format(kwargs))

                        q = es_dsl.Q(
                            "bool",
                            must_not=queries,
                            must=es_dsl.Q("query_string", **kwargs),
                        )
                else:
                    raise UnsupportedQuery("Complex regex not implemented")

        elif is_a(node, op.RANGE_OPS):
            if arg_values:
                raise UnsupportedQuery(
                    "Not allowed to use logical operators in 2nd argument for RANGE operators."
                )

            def prepare_range_args(node, arg22):
                range_args = {}
                if is_a(node, op.LT):
                    range_args["lt"] = arg22
                elif is_a(node, op.LTE):
                    range_args["lte"] = arg22
                elif is_a(node, op.GT):
                    range_args["gt"] = arg22
                elif is_a(node, op.GTE):
                    range_args["gte"] = arg22
                return range_args

            range_args = prepare_range_args(node, get_value(arg2))

            def construct_range_query(field: str, range_args: Dict):
                return es_dsl.Q("range", **{field: range_args})

            q = None
            if not field_values:
                q = construct_range_query(get_value(arg1), range_args)
            else:
                queries = [construct_range_query(f, range_args) for f in field_values]
                for logical in field_logicals:
                    l_queries = []
                    for field in logical.children:
                        l_queries.append(construct_range_query(field.value, range_args))
                    if is_a(logical, op.ARG_OR):
                        queries.append(es_dsl.Q("bool", should=l_queries))
                    elif is_a(logical, op.ARG_AND):
                        queries.append(es_dsl.Q("bool", must=l_queries))
                    else:
                        queries.append(es_dsl.Q("bool", must_not=l_queries))
                print(
                    "|create_es_query::RANGE| queries = {queries}".format(
                        queries=queries
                    )
                )
                if is_a(arg1, op.ARG_AND):
                    q = es_dsl.Q("bool", must=queries)
                elif is_a(arg1, op.ARG_OR):
                    q = es_dsl.Q("bool", should=queries)
                else:
                    q = es_dsl.Q("bool", must_not=queries)
    # elif isinstance(node, ast.TernaryOp):
    #     op = node.value
    #     arg1 = node.children[0]
    #     arg2 = node.children[1]
    #     arg3 = node.children[2]
    #     if op == query_dsl.Operators.RANGE:
    #         raise UnsupportedQuery('don\'t now what to do yet')
    #     else:
    #         raise UnsupportedQuery('what operators?')
    else:
        raise UnsupportedQuery("Unknown query op '{node}'".format(node=node))

    return q
    def construct_binary_op(field: ast.Node, arg: ast.Node, query_creator):
        """Build a query for a binary operator from a field node and an argument node.

        ``query_creator`` is called as ``query_creator(field_node, arg_node)``
        for each leaf pairing and its results are combined into ``bool``
        queries (``must`` / ``should`` / ``must_not``) according to the kind of
        logical node that groups them.

        Returns the combined query, or ``None`` when both ``field`` and ``arg``
        are logical nodes (that combination is not handled here).

        NOTE(review): the body reads ``arg_values``, ``node``, ``arg1``,
        ``arg2``, ``field_values``, ``field_logicals`` and
        ``construct_equals_query``, none of which are defined in this function;
        they presumably come from the enclosing scope -- confirm.
        """
        q = None
        if is_a(field, op.ARG_LOGICAL):
            if is_a(arg, op.ARG_LOGICAL):
                # Logical field combined with logical argument: nothing is
                # built, so the function falls through and returns None.
                pass
            else:
                # Logical field, plain argument: one query per field child.
                queries = []
                for n in field.children:
                    if is_a(n, op.ARGS):
                        queries.append(query_creator(n, arg))
                    elif is_a(n, op.ARG_LOGICAL):
                        # Nested logical group: build its children, then wrap
                        # them in the bool clause matching the group's kind.
                        l_queries = []
                        for child in n.children:
                            l_queries.append(query_creator(child, arg))
                        if is_a(n, op.ARG_AND):
                            queries.append(es_dsl.Q("bool", must=l_queries))
                        elif is_a(n, op.ARG_OR):
                            queries.append(es_dsl.Q("bool", should=l_queries))
                        else:
                            queries.append(es_dsl.Q("bool", must_not=l_queries))
                # NOTE(review): this branch is only reached when `arg` is NOT
                # an ARG_LOGICAL, yet the combinator is selected by testing
                # `arg`; `field` may have been intended -- confirm.
                if is_a(arg, op.ARG_AND):
                    q = es_dsl.Q("bool", must=queries)
                elif is_a(arg, op.ARG_OR):
                    q = es_dsl.Q("bool", should=queries)
                else:
                    q = es_dsl.Q("bool", must_not=queries)
                # NOTE(review): when `arg_values` is empty, everything below
                # rebuilds `queries` and overwrites the `q` computed above,
                # using names from outside this function -- confirm intent.
                if not arg_values:

                    def prepare_equals_arg(node, v):
                        # Identity hook: returns the value unchanged.
                        return v

                    # Rebinds the `arg` parameter to the plain value of `arg2`.
                    arg = prepare_equals_arg(node, get_value(arg2))
                    queries = [
                        construct_equals_query(field, arg) for field in field_values
                    ]
                    for logical in field_logicals:
                        l_queries = []
                        # This loop variable rebinds the `field` parameter.
                        for field in logical.children:
                            l_queries.append(construct_equals_query(field.value, arg))
                        if is_a(logical, op.ARG_OR):
                            queries.append(es_dsl.Q("bool", should=l_queries))
                        elif is_a(logical, op.ARG_AND):
                            queries.append(es_dsl.Q("bool", must=l_queries))
                        else:
                            queries.append(es_dsl.Q("bool", must_not=l_queries))
                    print(
                        "|create_es_query::EQUALS| queries = {queries}".format(
                            queries=queries
                        )
                    )
                    if is_a(arg1, op.ARG_AND):
                        q = es_dsl.Q("bool", must=queries)
                    elif is_a(arg1, op.ARG_OR):
                        q = es_dsl.Q("bool", should=queries)
                    else:
                        # Neither AND nor OR: exclude the per-field matches
                        # but still require a multi_match on the value.
                        q = es_dsl.Q(
                            "bool",
                            must_not=queries,
                            must=es_dsl.Q("multi_match", query=get_value(arg2)),
                        )
        else:
            if is_a(arg, op.ARG_LOGICAL):
                # Plain field, logical argument: one query per argument child.
                queries = []
                for n in arg.children:
                    if is_a(n, op.ARGS):
                        queries.append(query_creator(field, n))
                    elif is_a(n, op.ARG_LOGICAL):
                        l_queries = []
                        for child in n.children:
                            l_queries.append(query_creator(field, child))
                        if is_a(n, op.ARG_AND):
                            queries.append(es_dsl.Q("bool", must=l_queries))
                        elif is_a(n, op.ARG_OR):
                            queries.append(es_dsl.Q("bool", should=l_queries))
                        else:
                            queries.append(es_dsl.Q("bool", must_not=l_queries))
                if is_a(arg, op.ARG_AND):
                    q = es_dsl.Q("bool", must=queries)
                elif is_a(arg, op.ARG_OR):
                    q = es_dsl.Q("bool", should=queries)
                else:
                    q = es_dsl.Q("bool", must_not=queries)
            else:
                # Plain field and plain argument: a single leaf query.
                q = query_creator(field, arg)
        return q
def count_by_city_order_by_country(
        vk_elastic_db: es_client.VkDataDatabaseClient,
        size=10,
        is_need_other=True,
        is_need_print=False,
        is_need_plot=True,
        is_need_active=False,
        days_delta=20):
    """Count documents per city within each country and print/plot the result.

    Runs a ``terms`` aggregation on ``country.title.keyword`` with a nested
    ``terms`` aggregation on ``city.title.keyword``. Only documents where both
    fields exist and are not matched by an empty-string ``match`` query are
    considered.

    Args:
        vk_elastic_db: client handed to ``get_elastic_object`` to obtain the
            Elasticsearch connection.
        size: bucket limit used for both the country and the city aggregation.
        is_need_other: when true, append an ``"other"`` entry per country
            holding the city aggregation's ``sum_other_doc_count``.
        is_need_print: when true, print a ranked table per country.
        is_need_plot: when true, save one horizontal bar chart per country as
            ``{save_path}/{figname}.png``.
        is_need_active: when true, narrow the search with
            ``get_active_users_filter`` and add ``" active"`` to the title.
        days_delta: forwarded to ``get_active_users_filter``.

    Returns:
        None. Output is produced only through printing and saved figures.

    Relies on the names ``index``, ``save_path`` and ``plt`` being available
    from the surrounding module.
    """
    country_aggs_name = "country_count"
    city_aggs_name = "city_count"
    title = "count by city"
    if is_need_active:
        title += " active"

    es = get_elastic_object(vk_elastic_db)
    s = elasticsearch_dsl.Search(using=es, index=index)
    if is_need_active:
        s = get_active_users_filter(es, index, s, days_delta=days_delta)
    s = s.filter(
        "bool",
        must=[elasticsearch_dsl.Q("exists", field="country.title.keyword")])
    s = s.filter(
        "bool",
        must=[elasticsearch_dsl.Q("exists", field="city.title.keyword")])
    # Fixed: this used to read `country__title__keywordd` (double "d"), i.e.
    # the field `country.title.keywordd`, so the exclusion did not target the
    # field that is aggregated on.
    s = s.filter(
        "bool",
        must_not=[elasticsearch_dsl.Q("match", country__title__keyword="")])
    s = s.filter(
        "bool",
        must_not=[elasticsearch_dsl.Q("match", city__title__keyword="")])

    country_agg = elasticsearch_dsl.A('terms',
                                      field="country.title.keyword",
                                      size=size,
                                      collect_mode="breadth_first")
    city_agg = elasticsearch_dsl.A('terms',
                                   field="city.title.keyword",
                                   size=size)
    s.aggs.bucket(country_aggs_name, country_agg).bucket(city_aggs_name,
                                                         city_agg)
    response = s.execute()

    # Collect the per-country series first, then render them.
    data_dict = {}
    for country_hit in response.aggregations[country_aggs_name].buckets:
        city_agg_result = country_hit[city_aggs_name]
        x_axis = [hit.key for hit in city_agg_result.buckets]
        y_axis = [hit.doc_count for hit in city_agg_result.buckets]
        if is_need_other:
            x_axis.append("other")
            y_axis.append(city_agg_result.sum_other_doc_count)
        data_dict[country_hit.key] = {"x_axis": x_axis, "y_axis": y_axis}

    for country, series in data_dict.items():
        x_axis = series["x_axis"]
        y_axis = series["y_axis"]
        cur_title = f"{title}\n{country}"
        figname = f"{title.replace(' ', '_')}_{country}"
        if is_need_print:
            print(cur_title)
            for rank, (city, count) in enumerate(zip(x_axis, y_axis), start=1):
                print(f"{rank}\t{city} {count}")

        if is_need_plot:
            fig, ax = plt.subplots(1, 1)
            ax.set_title(cur_title)
            ax.barh(x_axis, y_axis)
            fig.savefig(f"{save_path}/{figname}.png",
                        dpi=300,
                        format='png',
                        bbox_inches='tight')
            # Close explicitly so figures do not accumulate across countries.
            plt.close(fig)
Exemple #29
0
def as_Q(field, query, fuzziness=1):
    """Return a ``match`` query on ``field`` for ``query`` with the given fuzziness."""
    match_body = {'query': query, 'fuzziness': fuzziness}
    per_field = {field: match_body}
    return dsl.Q('match', **per_field)
    def get_monthly_cost_by_product(cls,
                                    keys,
                                    tagged=False,
                                    date_from=None,
                                    date_to=None,
                                    size=0x7FFFFFFF):
        """Return per-month, per-product cost sums for the given account key(s).

        Args:
            keys: one linked account id or a list of them.
            tagged: when true, also break each product's cost down by
                ``tag.value`` and add an ``'untagged'`` remainder entry if the
                tag costs do not add up to the product total.
            date_from: start of the range; defaults to midnight on the first
                day of the current UTC month.
            date_to: end of the range; defaults to the last microsecond of the
                last day of ``date_from``'s month.
            size: bucket limit for the per-product ``terms`` aggregation.

        Returns:
            ``{'months': [...]}`` where each item has a ``'month'`` string and
            a ``'products'`` list of ``{'product', 'cost'[, 'tags']}`` dicts.
        """
        if not date_from:
            date_from = datetime.utcnow().replace(
                day=1, hour=0, minute=0, second=0, microsecond=0)
        if not date_to:
            _, last_day = calendar.monthrange(date_from.year, date_from.month)
            date_to = date_from.replace(day=last_day,
                                        hour=23,
                                        minute=59,
                                        second=59,
                                        microsecond=999999)

        account_ids = keys if isinstance(keys, list) else [keys]
        search = cls.search()
        search = search.filter('terms', linked_account_id=account_ids)
        search = search.filter('range',
                               usage_start_date={
                                   'from': date_from.isoformat(),
                                   'to': date_to.isoformat()
                               })

        # Aggregation tree: month -> product -> cost sum [-> tag -> cost sum].
        by_month = search.aggs.bucket('intervals',
                                      'date_histogram',
                                      field='usage_start_date',
                                      interval='month',
                                      min_doc_count=1)
        by_product = by_month.bucket('products',
                                     'terms',
                                     field='product_name',
                                     size=size)
        by_product.bucket('cost', 'sum', field='cost')
        if tagged:
            by_tag = by_product.bucket('tags', 'terms', field='tag.value')
            by_tag.bucket('cost', 'sum', field='cost')

        # Leave out line items whose cost is exactly zero.
        search = search.query('bool', filter=[~dsl.Q('term', cost=0)])
        raw = client.search(index='awsdetailedlineitem',
                            body=search.to_dict(),
                            size=0,
                            request_timeout=60)

        def tag_entries(tag_buckets, product_total):
            # One entry per tag bucket, plus the untagged remainder if any.
            entries = []
            tagged_sum = 0.0
            for tag_bucket in tag_buckets:
                tagged_sum += tag_bucket['cost']['value']
                entries.append({
                    'name': tag_bucket['key'],
                    'cost': tag_bucket['cost']['value'],
                })
            if product_total != tagged_sum:
                entries.append({
                    'name': 'untagged',
                    'cost': product_total - tagged_sum,
                })
            return entries

        def product_entry(bucket):
            # Map a product bucket to its output dict, using short names.
            entry = {
                'product': SHORT_NAMES.get(bucket['key'], bucket['key']),
                'cost': bucket['cost']['value'],
            }
            if tagged:
                entry['tags'] = tag_entries(bucket['tags']['buckets'],
                                            bucket['cost']['value'])
            return entry

        months = []
        for interval in raw['aggregations']['intervals']['buckets']:
            months.append({
                'month':
                interval['key_as_string'].split('T')[0],
                'products': [
                    product_entry(bucket)
                    for bucket in interval['products']['buckets']
                ],
            })
        return dict(months=months)