Esempio n. 1
0
def translate_to_lucene_filter(components):
    """Translate a list of constraints on components to a Lucene filter.

    Take a list of parsed glob components and build an equivalent Apache
    Lucene filter using hardcoded assumptions about the index schema.

    Three kinds of patterns are distinguished:
    - a single trailing globstar: only the components before it are
      constrained, the number of components is left open;
    - a "parent" pattern (every component but the last is a single fixed
      sequence and the last one starts with an AnySequence): a single match
      on the "parent" field;
    - anything else: the filters from _build_filters plus a match on
      END_MARK that pins the number of components.

    The `components` list itself is never modified, so the same list can be
    translated several times with the same result.

    Return a Lucene filter JSON string, or None when no constraint could be
    built.

    Raise GlobError when the pattern holds more than one globstar, or a
    globstar that is not the last component.
    """
    must_list = []
    globstars = components.count(GLOBSTAR)

    if globstars > 1:
        raise GlobError(
            "Contains more than one globstar (**) operator"
        )

    if globstars:
        if components.index(GLOBSTAR) != len(components) - 1:
            # GLOBSTAR is only supported at the end of a glob syntax
            raise GlobError(
                "Metric pattern syntax only supports '%s' at the end" % GLOBSTAR)
        # No defined length: match on the components before the GLOBSTAR.
        # Slice instead of pop() so the caller's list keeps its globstar.
        must_list = _build_filters(components[:-1])
    elif (  # Parent query
            len(components) > 1
            and
            all(len(c) == 1 and glob_utils.is_fixed_sequence(
                c[0]) for c in components[:-1])
            and
            isinstance(components[-1][0], glob_utils.AnySequence)
    ):
        # The parent value is every fixed component followed by a dot.
        parent = "".join(component[0] + "." for component in components[:-1])
        must_list.append(
            {"field": "parent", "type": "match", "value": parent})
    else:
        must_list = _build_filters(components)
        # Restrict length by matching the END_MARK
        must_list.append(
            {"field": _component_name(len(components)), "type": "match", "value": END_MARK})

    if not must_list:
        return None

    # Wrap the constraints in a boolean filter and serialize it.
    return json.dumps({"filter": {"type": "boolean", "must": must_list}})
Esempio n. 2
0
def translate_to_lucene_filter(components):
    """Translate a list of constraints on components to a Lucene filter.

    Take a list of parsed glob components and build an equivalent Apache
    Lucene filter using hardcoded assumptions about the index schema.

    The pattern is handled in one of three ways:
    - trailing globstar: constrain only the components that precede it and
      leave the number of components open;
    - parent query (all components but the last are a single fixed sequence
      and the last one starts with an AnySequence): match on the "parent"
      field only;
    - otherwise: use the filters from _build_filters and add a match on
      END_MARK to restrict the number of components.

    The `components` list is left untouched (no in-place removal of the
    globstar), which keeps repeated calls on the same list consistent.

    Return a Lucene filter JSON string, or None if there is nothing to
    filter on.

    Raise GlobError if there is more than one globstar or if the globstar
    is not the last component.
    """
    must_list = []
    globstars = components.count(GLOBSTAR)

    if globstars > 1:
        raise GlobError("Contains more than one globstar (**) operator")

    if globstars:
        if components.index(GLOBSTAR) != len(components) - 1:
            # GLOBSTAR is only supported at the end of a glob syntax
            raise GlobError(
                "Metric pattern syntax only supports '%s' at the end" % GLOBSTAR
            )
        # No defined length: match on the components before the GLOBSTAR.
        # A slice is used so that the caller's list is not mutated.
        must_list = _build_filters(components[:-1])
    elif (  # Parent query
        len(components) > 1
        and all(
            len(c) == 1 and glob_utils.is_fixed_sequence(c[0]) for c in components[:-1]
        )
        and isinstance(components[-1][0], glob_utils.AnySequence)
    ):
        # Each fixed component is followed by a dot in the parent value.
        parent = "".join(component[0] + "." for component in components[:-1])
        must_list.append({"field": "parent", "type": "match", "value": parent})
    else:
        must_list = _build_filters(components)
        # Restrict length by matching the END_MARK
        must_list.append(
            {
                "field": _component_name(len(components)),
                "type": "match",
                "value": END_MARK,
            }
        )

    if not must_list:
        return None

    # Wrap the constraints in a boolean filter and serialize it.
    return json.dumps({"filter": {"type": "boolean", "must": must_list}})
Esempio n. 3
0
def translate_to_lucene_filter(components):
    """Translate a list of constraints on components to a Lucene filter.

    Take a list of parsed glob components and build an equivalent Apache
    Lucene filter using hardcoded assumptions about the index schema. Each
    constraint is rendered with the FIELD_MATCH_VALUE template and the
    constraints are then substituted into the LUCENE_FILTER template.

    The pattern is handled in one of three ways:
    - trailing globstar: constrain only the components that precede it and
      leave the number of components open;
    - parent query (all components but the last are a single fixed sequence
      and the last one starts with an AnySequence): match on the "parent"
      field only;
    - otherwise: use the filters from _build_filters and add a match on
      END_MARK to restrict the number of components.

    The `components` list is left untouched (no in-place removal of the
    globstar), which keeps repeated calls on the same list consistent.

    Return the Lucene filter as a string, or None if there is nothing to
    filter on.

    Raise GlobError if there is more than one globstar or if the globstar
    is not the last component.
    """
    must_list = []

    globstars = components.count(GLOBSTAR)

    if globstars > 1:
        raise GlobError("Contains more than one globstar (**) operator")

    if globstars:
        if components.index(GLOBSTAR) != len(components) - 1:
            # GLOBSTAR is only supported at the end of a glob syntax
            raise GlobError(
                "Metric pattern syntax only supports '%s' at the end" %
                GLOBSTAR)
        # No defined length: match on the components before the GLOBSTAR.
        # A slice is used so that the caller's list is not mutated.
        must_list = _build_filters(components[:-1])
    elif (  # Parent query
            len(components) > 1 and all(
                len(c) == 1 and glob_utils.is_fixed_sequence(c[0])
                for c in components[:-1])
            and isinstance(components[-1][0], glob_utils.AnySequence)):
        # Each fixed component is followed by a dot in the parent value.
        parent = "".join(component[0] + "." for component in components[:-1])
        must_list.append(FIELD_MATCH_VALUE % ('parent', parent))
    else:
        must_list = _build_filters(components)
        # Restrict length by matching the END_MARK
        must_list.append(FIELD_MATCH_VALUE %
                         (_component_name(len(components)), END_MARK))

    if not must_list:
        return None

    # Join the constraints (with a nice indentation)
    return LUCENE_FILTER % ",\n            ".join(must_list)
Esempio n. 4
0
    def __generate_normal_names_queries(self, table, components):
        """Generate the select-names queries for a list of glob components.

        Each component is first reduced to its leading run of fixed strings
        and braces selectors; whatever follows is replaced by a single
        ANYSEQUENCE. Selectors are then expanded into one query per
        combination of values. When there are more combinations than
        self.max_queries_per_pattern, selectors are dropped (rightmost
        first, shallowest component first) until the count is low enough.

        Note that the entries of the `components` list are replaced in place.

        Return a list holding the result of __build_select_names_query for
        every generated combination.
        """
        # Only keep the component parts that enable us to build prefix queries.
        # This means any uninterrupted sequence of strings or braces selectors.
        # On the way, we keep the position and value counts of selectors for
        # further query simplification.
        idxlens = []
        combinations = 1
        for cidx, component in enumerate(components):
            entry = []
            end = 0
            for pidx, part in enumerate(component):
                if isinstance(part, bg_glob.SequenceIn):
                    count = len(part.values)
                    combinations *= count
                    entry.append((pidx, count))
                elif not bg_glob.is_fixed_sequence(part):
                    # If we have globs we can't do much more.
                    break

                end = pidx + 1

            idxlens.append(entry)
            simplified_component = component[:end]
            if len(simplified_component) < len(component):
                simplified_component.append(ANYSEQUENCE)

            components[cidx] = simplified_component

        # Skip any additional work if we have a basic query.
        if combinations == 1:
            return [self.__build_select_names_query(table, components)]

        # Reduce complexity by dropping the rightmost selector from each
        # component, starting with the shallowest component, until we have a low
        # enough combination count.
        for cidx, entry in enumerate(idxlens):
            if combinations <= self.max_queries_per_pattern:
                break

            while entry and combinations > self.max_queries_per_pattern:
                component = components[cidx]
                idx, count = entry.pop()

                surrounding_anyseqs = 0
                if idx > 0 and component[idx - 1] == ANYSEQUENCE:
                    surrounding_anyseqs += 1
                if (idx < len(component) - 1
                        and component[idx + 1] == ANYSEQUENCE):
                    surrounding_anyseqs += 1

                # If we have surrounding AnySeqs, then drop elements so that
                # only one remains. Otherwise, replace current part with AnySeq.
                if surrounding_anyseqs > 0:
                    while surrounding_anyseqs > 0:
                        del component[idx]
                        surrounding_anyseqs -= 1
                else:
                    component[idx] = ANYSEQUENCE

                # `combinations` is a product that includes `count`, so the
                # floor division is exact and keeps the counter an integer.
                combinations //= count

        # Pre-compute all possible values for each component.
        for cidx, component in enumerate(components):
            if not component:
                # Nothing to constrain: keep the component empty.
                components[cidx] = [[]]
                continue

            values = ['']
            stop = None
            for pidx, part in enumerate(component):
                if bg_glob.is_fixed_sequence(part):
                    values = [x + part for x in values]
                elif isinstance(part, bg_glob.SequenceIn):
                    values = [x + y for x in values for y in part.values]
                else:
                    # First part that cannot be enumerated: only what comes
                    # before it can be used as a prefix.
                    stop = pidx
                    break

            if stop is None:
                # Fully enumerable component: exact values.
                components[cidx] = [[x] for x in values]
            elif stop == 0:
                # No usable prefix at all: leave the component unconstrained.
                components[cidx] = [component[:1]]
            else:
                # The values are only prefixes, whether the wildcard was the
                # last part or a selector dropped in the middle of the
                # component; mark them as such so that they are not turned
                # into exact matches.
                components[cidx] = [[x, ANYSEQUENCE] for x in values]

        # Generate queries using the combinations of pre-computed values for the
        # components.
        return [
            self.__build_select_names_query(table, combination)
            for combination in itertools.product(*components)
        ]
Esempio n. 5
0
    def __build_select_names_query(self, table, components):
        """Build the query selecting names from `table` for `components`.

        `components` is a sequence of components, each one a sequence of
        parts. Depending on its shape the query is, in order of preference:
        - an unfiltered select when there are no components;
        - a match on the exact name when every component is a fixed value;
        - a match on the exact parent, optionally narrowed by a prefix on the
          last component, when all but the last component are fixed values;
        - a conjunction of per-component equality/prefix clauses otherwise.

        Every query is limited to self.max_metrics_per_pattern + 1 rows.

        Return the query as a string.
        """
        query_select = "SELECT name FROM \"%s\".\"%s\"" % (
            self.keyspace_metadata,
            table,
        )
        query_limit = "LIMIT %d" % (self.max_metrics_per_pattern + 1)

        if len(components) == 0:
            return "%s %s;" % (query_select, query_limit)

        # If all components are constant values we can search by exact name.
        # If all but the last component are constant values we can search by
        # exact parent, in which case we may benefit from filtering the last
        # component by prefix when we have one. (Code refers to the previous-to
        # -last component because of the __END__ suffix we use).
        #
        # We are not using prefix search on the parent because it appears to be
        # too slow/costly at the moment (see #174 for details).
        #
        # At least two components are required here: with the end marker
        # alone there is no previous-to-last component to look at, so that
        # case is handled by the generic clauses below.
        if (len(components) > 1 and
                components[-1] == [_LAST_COMPONENT] and  # Not a prefix globstar
                all(
                    len(c) == 1 and bg_glob.is_fixed_sequence(c[0])
                    for c in components[:-2])):
            last = components[-2]
            if len(last) == 1 and bg_glob.is_fixed_sequence(last[0]):
                # XXX(d.forest): do not try to optimize by passing the raw glob
                #                and using it here; because this is invalid in
                #                cases where the glob contains braces.
                name = DIRECTORY_SEPARATOR.join(
                    itertools.chain.from_iterable(components[:-1]))
                return "%s WHERE name = %s %s;" % (
                    query_select,
                    c_encoder.cql_quote(name),
                    query_limit,
                )

            if len(last) > 0 and bg_glob.is_fixed_sequence(last[0]):
                prefix_filter = "AND component_%d LIKE %s" % (
                    len(components) - 2,
                    c_encoder.cql_quote(last[0] + '%'),
                )
                allow_filtering = "ALLOW FILTERING"
            else:
                # No usable prefix: both placeholders stay empty.
                prefix_filter = ''
                allow_filtering = ''

            parent = itertools.chain.from_iterable(components[:-2])
            parent = DIRECTORY_SEPARATOR.join(parent) + DIRECTORY_SEPARATOR
            return "%s WHERE parent = %s %s %s %s;" % (
                query_select,
                c_encoder.cql_quote(parent),
                prefix_filter,
                query_limit,
                allow_filtering,
            )

        where_clauses = []

        for n, component in enumerate(components):
            if len(component) == 0:
                continue

            # We are currently using prefix indexes, so if we do not have a
            # prefix value (i.e. it is a wildcard), then the current component
            # cannot be constrained inside the request.
            value = component[0]
            if not bg_glob.is_fixed_sequence(value):
                continue

            if len(component) == 1:
                op = '='
            else:
                # More parts follow the fixed value: match it as a prefix.
                op = "LIKE"
                value += '%'

            clause = "component_%d %s %s" % (n, op, c_encoder.cql_quote(value))
            where_clauses.append(clause)

        if len(where_clauses) == 0:
            return "%s %s;" % (query_select, query_limit)

        return "%s WHERE %s %s ALLOW FILTERING;" % (
            query_select, " AND ".join(where_clauses), query_limit)
Esempio n. 6
0
    def __generate_normal_names_queries(self, table, components):
        """Generate the select-names queries for a list of glob components.

        Each component is first reduced to its leading run of fixed strings
        and braces selectors; whatever follows is replaced by a single
        ANYSEQUENCE. Selectors are then expanded into one query per
        combination of values. When there are more combinations than
        self.max_queries_per_pattern, selectors are dropped (rightmost
        first, shallowest component first) until the count is low enough.

        Note that the entries of the `components` list are replaced in place.

        Return a list holding the result of __build_select_names_query for
        every generated combination.
        """
        # Only keep the component parts that enable us to build prefix queries.
        # This means any uninterrupted sequence of strings or braces selectors.
        # On the way, we keep the position and value counts of selectors for
        # further query simplification.
        idxlens = []
        combinations = 1
        for cidx, component in enumerate(components):
            entry = []
            end = 0
            for pidx, part in enumerate(component):
                if isinstance(part, bg_glob.SequenceIn):
                    count = len(part.values)
                    combinations *= count
                    entry.append((pidx, count))
                elif not bg_glob.is_fixed_sequence(part):
                    # If we have globs we can't do much more.
                    break

                end = pidx + 1

            idxlens.append(entry)
            simplified_component = component[:end]
            if len(simplified_component) < len(component):
                simplified_component.append(ANYSEQUENCE)

            components[cidx] = simplified_component

        # Skip any additional work if we have a basic query.
        if combinations == 1:
            return [self.__build_select_names_query(table, components)]

        # Reduce complexity by dropping the rightmost selector from each
        # component, starting with the shallowest component, until we have a low
        # enough combination count.
        for cidx, entry in enumerate(idxlens):
            if combinations <= self.max_queries_per_pattern:
                break

            while entry and combinations > self.max_queries_per_pattern:
                component = components[cidx]
                idx, count = entry.pop()

                surrounding_anyseqs = 0
                if idx > 0 and component[idx - 1] == ANYSEQUENCE:
                    surrounding_anyseqs += 1
                if idx < len(component) - 1 and component[idx + 1] == ANYSEQUENCE:
                    surrounding_anyseqs += 1

                # If we have surrounding AnySeqs, then drop elements so that
                # only one remains. Otherwise, replace current part with AnySeq.
                if surrounding_anyseqs > 0:
                    while surrounding_anyseqs > 0:
                        del component[idx]
                        surrounding_anyseqs -= 1
                else:
                    component[idx] = ANYSEQUENCE

                # `combinations` is a product that includes `count`, so the
                # floor division is exact and keeps the counter an integer.
                combinations //= count

        # Pre-compute all possible values for each component.
        for cidx, component in enumerate(components):
            if not component:
                # Nothing to constrain: keep the component empty.
                components[cidx] = [[]]
                continue

            values = [""]
            stop = None
            for pidx, part in enumerate(component):
                if bg_glob.is_fixed_sequence(part):
                    values = [x + part for x in values]
                elif isinstance(part, bg_glob.SequenceIn):
                    values = [x + y for x in values for y in part.values]
                else:
                    # First part that cannot be enumerated: only what comes
                    # before it can be used as a prefix.
                    stop = pidx
                    break

            if stop is None:
                # Fully enumerable component: exact values.
                components[cidx] = [[x] for x in values]
            elif stop == 0:
                # No usable prefix at all: leave the component unconstrained.
                components[cidx] = [component[:1]]
            else:
                # The values are only prefixes, whether the wildcard was the
                # last part or a selector dropped in the middle of the
                # component; mark them as such so that they are not turned
                # into exact matches.
                components[cidx] = [[x, ANYSEQUENCE] for x in values]

        # Generate queries using the combinations of pre-computed values for the
        # components.
        return [
            self.__build_select_names_query(table, combination)
            for combination in itertools.product(*components)
        ]
Esempio n. 7
0
    def __build_select_names_query(self, table, components):
        """Build the query selecting names from `table` for `components`.

        `components` is a sequence of components, each one a sequence of
        parts. Depending on its shape the query is, in order of preference:
        - an unfiltered select when there are no components;
        - a match on the exact name when every component is a fixed value;
        - a match on the exact parent, optionally narrowed by a prefix on the
          last component, when all but the last component are fixed values;
        - a conjunction of per-component equality/prefix clauses otherwise.

        Every query is limited to self.max_metrics_per_pattern + 1 rows.

        Return the query as a string.
        """
        query_select = 'SELECT name FROM "%s"."%s"' % (self.keyspace_metadata, table)
        query_limit = "LIMIT %d" % (self.max_metrics_per_pattern + 1)

        if len(components) == 0:
            return "%s %s;" % (query_select, query_limit)

        # If all components are constant values we can search by exact name.
        # If all but the last component are constant values we can search by
        # exact parent, in which case we may benefit from filtering the last
        # component by prefix when we have one. (Code refers to the previous-to
        # -last component because of the __END__ suffix we use).
        #
        # We are not using prefix search on the parent because it appears to be
        # too slow/costly at the moment (see #174 for details).
        #
        # At least two components are required here: with the end marker
        # alone there is no previous-to-last component to look at, so that
        # case is handled by the generic clauses below.
        if (
            len(components) > 1
            and components[-1] == [_LAST_COMPONENT]  # Not a prefix globstar
            and all(
                len(c) == 1 and bg_glob.is_fixed_sequence(c[0])
                for c in components[:-2]
            )
        ):
            last = components[-2]
            if len(last) == 1 and bg_glob.is_fixed_sequence(last[0]):
                # XXX(d.forest): do not try to optimize by passing the raw glob
                #                and using it here; because this is invalid in
                #                cases where the glob contains braces.
                name = DIRECTORY_SEPARATOR.join(
                    itertools.chain.from_iterable(components[:-1])
                )
                return "%s WHERE name = %s %s;" % (
                    query_select,
                    c_encoder.cql_quote(name),
                    query_limit,
                )

            if len(last) > 0 and bg_glob.is_fixed_sequence(last[0]):
                prefix_filter = "AND component_%d LIKE %s" % (
                    len(components) - 2,
                    c_encoder.cql_quote(last[0] + "%"),
                )
                allow_filtering = "ALLOW FILTERING"
            else:
                # No usable prefix: both placeholders stay empty.
                prefix_filter = ""
                allow_filtering = ""

            parent = itertools.chain.from_iterable(components[:-2])
            parent = DIRECTORY_SEPARATOR.join(parent) + DIRECTORY_SEPARATOR
            return "%s WHERE parent = %s %s %s %s;" % (
                query_select,
                c_encoder.cql_quote(parent),
                prefix_filter,
                query_limit,
                allow_filtering,
            )

        where_clauses = []

        for n, component in enumerate(components):
            if len(component) == 0:
                continue

            # We are currently using prefix indexes, so if we do not have a
            # prefix value (i.e. it is a wildcard), then the current component
            # cannot be constrained inside the request.
            value = component[0]
            if not bg_glob.is_fixed_sequence(value):
                continue

            if len(component) == 1:
                op = "="
            else:
                # More parts follow the fixed value: match it as a prefix.
                op = "LIKE"
                value += "%"

            clause = "component_%d %s %s" % (n, op, c_encoder.cql_quote(value))
            where_clauses.append(clause)

        if len(where_clauses) == 0:
            return "%s %s;" % (query_select, query_limit)

        return "%s WHERE %s %s ALLOW FILTERING;" % (
            query_select,
            " AND ".join(where_clauses),
            query_limit,
        )