Example #1
    def count(self, query_compiler):
        query_params, post_processing = self._resolve_tasks(query_compiler)

        # Elasticsearch's _count API is very efficient, so we use it to return results here.
        # This means that data frames with a restricted size or sort params will not return
        # valid results (_count doesn't support size).
        # Longer term we may fall back to pandas, but that may load the entire index into memory.
        if self._size(query_params, post_processing) is not None:
            raise NotImplementedError(
                f"Requesting count with additional query and processing parameters "
                f"not supported {query_params} {post_processing}")

        # Only return requested field_names
        fields = query_compiler.get_field_names(include_scripted_fields=False)

        counts = {}
        for field in fields:
            body = Query(query_params["query"])
            body.exists(field, must=True)

            field_exists_count = query_compiler._client.count(
                index=query_compiler._index_pattern, body=body.to_count_body())
            counts[field] = field_exists_count

        return pd.Series(data=counts, index=fields)
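For context, here is a minimal sketch of the request this method issues per field - an exists-filtered _count - written against the elasticsearch-py client directly. The index and field names are assumptions, and the raw client returns a dict rather than a bare count:

    from elasticsearch import Elasticsearch

    es = Elasticsearch()  # assumes a locally running cluster

    # Mirrors body.exists(field, must=True) followed by client.count(...)
    resp = es.count(
        index="flights",  # hypothetical index
        body={"query": {"bool": {"must": [{"exists": {"field": "Carrier"}}]}}},
    )
    print(resp["count"])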
Example #2
    def _terms_aggs(self, query_compiler, func, es_size=None):
        """
        Parameters
        ----------
        es_size: int, default None
            Parameter used by Series.value_counts()

        Returns
        -------
        pandas.Series
            Series containing results of `func` applied to the field_name(s)
        """
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}")

        # Get just aggregatable field_names
        aggregatable_field_names = query_compiler._mappings.aggregatable_field_names()

        body = Query(query_params["query"])

        for field in aggregatable_field_names.keys():
            body.terms_aggs(field, func, field, es_size=es_size)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())

        results = {}

        for key in aggregatable_field_names.keys():
            # key is aggregatable field, value is label
            # e.g. key=category.keyword, value=category
            for bucket in response["aggregations"][key]["buckets"]:
                results[bucket["key"]] = bucket["doc_count"]

        try:
            # get first value in dict (key is .keyword)
            name = list(aggregatable_field_names.values())[0]
        except IndexError:
            name = None

        s = pd.Series(data=results, index=results.keys(), name=name)

        return s
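The search body built by body.terms_aggs(...) amounts to one standard terms aggregation per aggregatable field, named after that field. A hand-written equivalent for a single field (the field name is an assumption):

    body = {
        "size": 0,  # we only want the aggregation buckets
        "aggs": {
            "category.keyword": {  # agg name: the aggregatable field
                "terms": {
                    "field": "category.keyword",
                    "size": 10,  # es_size, e.g. from Series.value_counts()
                }
            }
        },
    }
    # response["aggregations"]["category.keyword"]["buckets"] is then a list
    # of {"key": <term>, "doc_count": <count>} entries, as consumed above.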
Example #3
    def _resolve_tasks(self, query_compiler):
        # We now try to combine all tasks into a single Elasticsearch query.
        # Some operations can simply be combined into one query;
        # others require pre-queries and then combination;
        # others require in-core post-processing of the results.
        query_params = {
            "query_sort_field": None,
            "query_sort_order": None,
            "query_size": None,
            "query_fields": None,
            "query_script_fields": None,
            "query": Query(),
        }

        post_processing = []

        for task in self._tasks:
            query_params, post_processing = task.resolve_task(
                query_params, post_processing, query_compiler)

        if self._arithmetic_op_fields_task is not None:
            (
                query_params,
                post_processing,
            ) = self._arithmetic_op_fields_task.resolve_task(
                query_params, post_processing, query_compiler)

        return query_params, post_processing
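The loop is effectively a fold: each task receives the accumulated (query_params, post_processing) pair and returns an updated pair. A minimal sketch of that protocol with a hypothetical task:

    class HeadTask:
        """Hypothetical task: keep only the first n rows."""

        def __init__(self, n):
            self._n = n

        def resolve_task(self, query_params, post_processing, query_compiler):
            # Fold this task's constraint into the accumulated parameters
            if query_params["query_size"] is None:
                query_params["query_size"] = self._n
            else:
                query_params["query_size"] = min(query_params["query_size"], self._n)
            return query_params, post_processing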
Example #4
    def index_count(self, query_compiler, field):
        # field is the index field so count values
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)

        # Size is dictated by operations
        if size is not None:
            # TODO - this is not necessarily valid as the field may not exist in ALL these docs
            return size

        body = Query(query_params["query"])
        body.exists(field, must=True)

        return query_compiler._client.count(
            index=query_compiler._index_pattern, body=body.to_count_body())
Example #5
    def info_es(self, query_compiler, buf):
        buf.write("Operations:\n")
        buf.write(f" tasks: {self._tasks}\n")

        query_params, post_processing = self._resolve_tasks(query_compiler)
        size, sort_params = Operations._query_params_to_size_and_sort(
            query_params)
        _source = query_compiler._mappings.get_field_names()

        script_fields = query_params["query_script_fields"]
        query = Query(query_params["query"])
        body = query.to_search_body()
        if script_fields is not None:
            body["script_fields"] = script_fields

        buf.write(f" size: {size}\n")
        buf.write(f" sort_params: {sort_params}\n")
        buf.write(f" _source: {_source}\n")
        buf.write(f" body: {body}\n")
        buf.write(f" post_processing: {post_processing}\n")
Example #6
    def describe(self, query_compiler):
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}")

        numeric_source_fields = query_compiler._mappings.numeric_source_fields()

        # for each field we compute:
        # count, mean, std, min, 25%, 50%, 75%, max
        body = Query(query_params["query"])

        for field in numeric_source_fields:
            body.metric_aggs("extended_stats_" + field, "extended_stats",
                             field)
            body.metric_aggs("percentiles_" + field, "percentiles", field)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())

        results = {}

        for field in numeric_source_fields:
            values = list()
            values.append(response["aggregations"]["extended_stats_" +
                                                   field]["count"])
            values.append(response["aggregations"]["extended_stats_" +
                                                   field]["avg"])
            values.append(response["aggregations"]["extended_stats_" +
                                                   field]["std_deviation"])
            values.append(response["aggregations"]["extended_stats_" +
                                                   field]["min"])
            values.append(response["aggregations"]["percentiles_" +
                                                   field]["values"]["25.0"])
            values.append(response["aggregations"]["percentiles_" +
                                                   field]["values"]["50.0"])
            values.append(response["aggregations"]["percentiles_" +
                                                   field]["values"]["75.0"])
            values.append(response["aggregations"]["extended_stats_" +
                                                   field]["max"])

            # Keep the field only if at least one statistic is not None
            if values.count(None) < len(values):
                results[field] = values

        df = pd.DataFrame(
            data=results,
            index=["count", "mean", "std", "min", "25%", "50%", "75%", "max"],
        )

        return df
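For a single numeric field, the aggregation body assembled above looks like this (extended_stats and percentiles are standard Elasticsearch metric aggregations; the field name is an assumption):

    body = {
        "aggs": {
            "extended_stats_AvgTicketPrice": {
                "extended_stats": {"field": "AvgTicketPrice"}
            },
            # The default percentiles include 25.0, 50.0 and 75.0,
            # which is what the response parsing above reads
            "percentiles_AvgTicketPrice": {
                "percentiles": {"field": "AvgTicketPrice"}
            },
        }
    }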
Example #7
    def aggs(self, query_compiler, pd_aggs):
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}")

        field_names = query_compiler.get_field_names(
            include_scripted_fields=False)

        body = Query(query_params["query"])

        # convert pandas aggs to ES equivalent
        es_aggs = self._map_pd_aggs_to_es_aggs(pd_aggs)

        for field in field_names:
            for es_agg in es_aggs:
                # If multiple pandas aggs map to the same ES agg (e.g. 'extended_stats'),
                # the second call is simply a no-op
                if isinstance(es_agg, tuple):
                    body.metric_aggs(es_agg[0] + "_" + field, es_agg[0], field)
                else:
                    body.metric_aggs(es_agg + "_" + field, es_agg, field)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())
        """
        Results are like (for 'sum', 'min')

             AvgTicketPrice  DistanceKilometers  DistanceMiles  FlightDelayMin
        sum    8.204365e+06        9.261629e+07   5.754909e+07          618150
        min    1.000205e+02        0.000000e+00   0.000000e+00               0
        """
        results = {}

        for field in field_names:
            values = list()
            for es_agg in es_aggs:
                if isinstance(es_agg, tuple):
                    values.append(response["aggregations"][es_agg[0] + "_" +
                                                           field][es_agg[1]])
                else:
                    values.append(response["aggregations"][es_agg + "_" +
                                                           field]["value"])

            results[field] = values

        df = pd.DataFrame(data=results, index=pd_aggs)

        return df
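_map_pd_aggs_to_es_aggs is not shown here, but the tuple branch above implies that some pandas aggs resolve to a (parent_agg, result_key) pair, while simple ones map to a single agg whose result is read from "value". A hedged reconstruction of what such a mapping could look like:

    # Hypothetical reconstruction - the real mapping lives in _map_pd_aggs_to_es_aggs
    PD_TO_ES_AGGS = {
        "sum": "sum",                                # result read from ["value"]
        "min": "min",
        "max": "max",
        "mean": "avg",
        "std": ("extended_stats", "std_deviation"),  # result read from the named key
        "var": ("extended_stats", "variance"),
    }

    def map_pd_aggs_to_es_aggs(pd_aggs):
        return [PD_TO_ES_AGGS[agg] for agg in pd_aggs]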
Example #8
    def index_matches_count(self, query_compiler, field, items):
        query_params, post_processing = self._validate_index_operation(
            query_compiler, items)

        body = Query(query_params["query"])

        if field == Index.ID_INDEX_FIELD:
            body.ids(items, must=True)
        else:
            body.terms(field, items, must=True)

        return query_compiler._client.count(
            index=query_compiler._index_pattern, body=body.to_count_body())
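Written out by hand, the two branches produce standard ids and terms queries (the field name and values are placeholders):

    # field == Index.ID_INDEX_FIELD: match on document _id
    ids_body = {"query": {"bool": {"must": [{"ids": {"values": ["id1", "id2"]}}]}}}

    # any other field: terms filter on that field
    terms_body = {"query": {"bool": {"must": [{"terms": {"my_field": ["a", "b"]}}]}}}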
Example #9
    def _es_results(self, query_compiler, collector):
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size, sort_params = Operations._query_params_to_size_and_sort(
            query_params)

        script_fields = query_params["query_script_fields"]
        query = Query(query_params["query"])

        body = query.to_search_body()
        if script_fields is not None:
            body["script_fields"] = script_fields

        # Only return requested field_names
        _source = query_compiler.get_field_names(include_scripted_fields=False)
        if _source:
            # For query_compiler._client.search we could add _source
            # as a parameter, or add this value to the body.
            #
            # If _source is a parameter it is encoded into the URL.
            #
            # If _source is a large number of fields (1000+) this can result in an
            # extremely long URL and a `too_long_frame_exception`. Therefore, add
            # _source to the body rather than as a _source parameter.
            body["_source"] = _source
        else:
            _source = False

        es_results = None

        # If size is None, use scan rather than search - then post-sort the results in the df
        # If size > 10000, use scan
        is_scan = False
        if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
            if size > 0:
                try:
                    es_results = query_compiler._client.search(
                        index=query_compiler._index_pattern,
                        size=size,
                        sort=sort_params,
                        body=body,
                    )
                except Exception:
                    # Catch all ES errors and print debug (currently to stdout)
                    error = {
                        "index": query_compiler._index_pattern,
                        "size": size,
                        "sort": sort_params,
                        "body": body,
                    }
                    print("Elasticsearch error:", error)
                    raise
        else:
            is_scan = True
            es_results = query_compiler._client.scan(
                index=query_compiler._index_pattern, query=body)
            # create post sort
            if sort_params is not None:
                post_processing.append(SortFieldAction(sort_params))

        if is_scan:
            while True:
                partial_result, df = query_compiler._es_results_to_pandas(
                    es_results, collector.batch_size(),
                    collector.show_progress)
                df = self._apply_df_post_processing(df, post_processing)
                collector.collect(df)
                if not partial_result:
                    break
        else:
            partial_result, df = query_compiler._es_results_to_pandas(
                es_results)
            df = self._apply_df_post_processing(df, post_processing)
            collector.collect(df)
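The search-versus-scan split mirrors what you would write with elasticsearch-py directly: search up to the result window, elasticsearch.helpers.scan beyond it (scan does not honour sort, hence the SortFieldAction post-processing step). A minimal sketch under those assumptions:

    from elasticsearch import Elasticsearch
    from elasticsearch.helpers import scan

    es = Elasticsearch()
    DEFAULT_ES_MAX_RESULT_WINDOW = 10000  # Elasticsearch's default index.max_result_window

    def fetch_hits(index, body, size=None):
        if size is not None and size <= DEFAULT_ES_MAX_RESULT_WINDOW:
            resp = es.search(index=index, size=size, body=body)
            return resp["hits"]["hits"]
        # scan streams every hit via the scroll API but ignores sort order
        return list(scan(es, index=index, query=body))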
Example #10
    def _hist_aggs(self, query_compiler, num_bins):
        # Get histogram bins and weights for numeric field_names
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}")

        numeric_source_fields = query_compiler._mappings.numeric_source_fields()

        body = Query(query_params["query"])

        min_aggs = self._metric_aggs(query_compiler, "min", numeric_only=True)
        max_aggs = self._metric_aggs(query_compiler, "max", numeric_only=True)

        for field in numeric_source_fields:
            body.hist_aggs(field, field, min_aggs, max_aggs, num_bins)

        response = query_compiler._client.search(
            index=query_compiler._index_pattern,
            size=0,
            body=body.to_search_body())
        # results are like
        # "aggregations" : {
        #     "DistanceKilometers" : {
        #       "buckets" : [
        #         {
        #           "key" : 0.0,
        #           "doc_count" : 2956
        #         },
        #         {
        #           "key" : 1988.1482421875,
        #           "doc_count" : 768
        #         },
        #         ...

        bins = {}
        weights = {}

        # There is one more bin than weights:
        # len(bins) == len(weights) + 1

        # bins = [  0.  36.  72. 108. 144. 180. 216. 252. 288. 324. 360.]
        # len(bins) == 11
        # weights = [10066.,   263.,   386.,   264.,   273.,   390.,   324.,   438.,   261.,   394.]
        # len(weights) == 10

        # Elasticsearch returns one extra trailing bucket:
        # weights = [10066.,   263.,   386.,   264.,   273.,   390.,   324.,   438.,   261.,   252.,    142.]
        # so we sum the last two buckets into the final weight
        for field in numeric_source_fields:
            # In the Series case, let plotting.ed_hist_series throw the exception
            if not response.get("aggregations"):
                continue

            # In the DataFrame case, warn that the field is excluded
            if not response["aggregations"].get(field):
                warnings.warn(
                    f"{field} has no meaningful histogram interval and will be excluded. "
                    f"All values 0.",
                    UserWarning,
                )
                continue

            buckets = response["aggregations"][field]["buckets"]

            bins[field] = []
            weights[field] = []

            for bucket in buckets:
                bins[field].append(bucket["key"])

                if bucket == buckets[-1]:
                    weights[field][-1] += bucket["doc_count"]
                else:
                    weights[field].append(bucket["doc_count"])

        df_bins = pd.DataFrame(data=bins)
        df_weights = pd.DataFrame(data=weights)

        return df_bins, df_weights
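The bucket merging deserves a worked example. For num_bins bins Elasticsearch returns num_bins + 1 buckets, and the loop folds the final bucket's count into the previous weight. Using the doc counts from the comment above:

    doc_counts = [10066, 263, 386, 264, 273, 390, 324, 438, 261, 252, 142]

    weights = []
    for i, count in enumerate(doc_counts):
        if i == len(doc_counts) - 1:
            weights[-1] += count  # merge the extra trailing bucket
        else:
            weights.append(count)

    assert weights == [10066, 263, 386, 264, 273, 390, 324, 438, 261, 394]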
Example #11
    def _metric_aggs(
        self,
        query_compiler,
        func,
        field_types=None,
        numeric_only=None,
        keep_original_dtype=False,
    ):
        """
        Parameters
        ----------
        field_types: str, default None
            If `aggregatable`, use only field_names whose fields in Elasticsearch are aggregatable.
            If `None`, use only numeric fields.
        keep_original_dtype : bool, default False
            If `True`, the output values keep the same domain as the input values,
            i.e. booleans stay booleans.

        Returns
        -------
        pandas.Series
            Series containing results of `func` applied to the field_name(s)
        """
        query_params, post_processing = self._resolve_tasks(query_compiler)

        size = self._size(query_params, post_processing)
        if size is not None:
            raise NotImplementedError(
                f"Can not count field matches if size is set {size}")

        body = Query(query_params["query"])

        results = {}

        # Some metric aggs (including cardinality) work on all aggregatable fields,
        # so operations that call _metric_aggs take an optional field_types
        # parameter to select them
        if field_types == "aggregatable":
            aggregatable_field_names = (
                query_compiler._mappings.aggregatable_field_names())

            for field in aggregatable_field_names.keys():
                body.metric_aggs(field, func, field)

            response = query_compiler._client.search(
                index=query_compiler._index_pattern,
                size=0,
                body=body.to_search_body())

            # Results are of the form
            # "aggregations" : {
            #   "customer_full_name.keyword" : {
            #     "value" : 10
            #   }
            # }

            # map aggregatable (e.g. x.keyword) to field_name
            for key, value in aggregatable_field_names.items():
                results[value] = response["aggregations"][key]["value"]
        else:
            if numeric_only:
                (
                    pd_dtypes,
                    source_fields,
                    date_formats,
                ) = query_compiler._mappings.metric_source_fields(
                    include_bool=True)
            else:
                # The only non-numerics we support are bool and timestamps currently
                # strings are not supported by metric aggs in ES
                # TODO - sum isn't supported for Timestamp in pandas - although ES does attempt to do it
                (
                    pd_dtypes,
                    source_fields,
                    date_formats,
                ) = query_compiler._mappings.metric_source_fields(
                    include_bool=True, include_timestamp=True)

            for field in source_fields:
                body.metric_aggs(field, func, field)

            response = query_compiler._client.search(
                index=query_compiler._index_pattern,
                size=0,
                body=body.to_search_body())

            # Results are of the form
            # "aggregations" : {
            #   "AvgTicketPrice" : {
            #     "value" : 628.2536888148849
            #   },
            #   "timestamp": {
            #     "value": 1.5165624455644382E12,
            #     "value_as_string": "2018-01-21T19:20:45.564Z"
            #   }
            # }
            for pd_dtype, field, date_format in zip(pd_dtypes, source_fields,
                                                    date_formats):
                if is_datetime_or_timedelta_dtype(pd_dtype):
                    results[field] = elasticsearch_date_to_pandas_date(
                        response["aggregations"][field]["value_as_string"],
                        date_format)
                elif keep_original_dtype:
                    results[field] = pd_dtype.type(
                        response["aggregations"][field]["value"])
                else:
                    results[field] = response["aggregations"][field]["value"]

        # Return single value if this is a series
        # if len(numeric_source_fields) == 1:
        #    return np.float64(results[numeric_source_fields[0]])
        s = pd.Series(data=results, index=results.keys())

        return s
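In the numeric branch, the per-field body is a plain metric aggregation named after the field; for func="avg" on the fields from the response comment above it would be:

    body = {
        "aggs": {
            "AvgTicketPrice": {"avg": {"field": "AvgTicketPrice"}},
            "timestamp": {"avg": {"field": "timestamp"}},
        }
    }
    # Date fields come back with both "value" (epoch millis) and
    # "value_as_string", which is why the datetime branch reads
    # response["aggregations"][field]["value_as_string"].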
Example #12
    def test_copy(self):
        q = Query()

        q.exists("field_a")
        q.exists("field_b", must=False)

        print(q.to_search_body())

        # Query(q) should take a copy: later changes to `q` must not
        # leak into `q1`, and vice versa
        q1 = Query(q)

        q.exists("field_c", must=False)
        q1.exists("field_c1", must=False)

        print(q.to_search_body())
        print(q1.to_search_body())
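The test exercises Query's copy-constructor semantics: changes to q after the copy (field_c) must not appear in q1, and vice versa (field_c1). A standalone analogue of the same contract using copy.deepcopy and a stand-in class (an assumption about Query's internals):

    import copy

    class Box:
        """Stand-in for Query(q): the constructor deep-copies its argument."""

        def __init__(self, other=None):
            self.items = copy.deepcopy(other.items) if other is not None else []

    b = Box()
    b.items.append("field_a")

    b1 = Box(b)  # copy
    b.items.append("field_c")
    b1.items.append("field_c1")

    assert b.items == ["field_a", "field_c"]
    assert b1.items == ["field_a", "field_c1"]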