def translate_to_lucene_filter(components):
    """Translate a list of constraints on components to a Lucene query.

    Build an equivalent Apache Lucene filter using hardcoded assumptions about
    the index schema.

    Args:
      components: sequence of parsed glob components. It must support
        count(), index() and slicing. It is never modified.

    Returns:
      The Lucene filter as a JSON string, or None when no constraint could be
      derived from the components.

    Raises:
      GlobError: if the pattern contains more than one globstar, or if the
        globstar is not the last component.
    """
    globstars = components.count(GLOBSTAR)
    if globstars > 1:
        raise GlobError("Contains more than one globstar (**) operator")

    if globstars:
        if components.index(GLOBSTAR) != len(components) - 1:
            # GLOBSTAR is only supported at the end of a glob syntax.
            raise GlobError(
                "Metric pattern syntax only supports '%s' at the end" % GLOBSTAR
            )
        # No defined length: match on the components before the GLOBSTAR.
        # Slice instead of pop() so the caller's sequence is left untouched.
        must_list = _build_filters(components[:-1])
    elif (
        # Parent query: every component but the last is a single fixed
        # sequence and the last one starts with an AnySequence.
        len(components) > 1
        and all(
            len(c) == 1 and glob_utils.is_fixed_sequence(c[0])
            for c in components[:-1]
        )
        and isinstance(components[-1][0], glob_utils.AnySequence)
    ):
        parent = "".join(component[0] + "." for component in components[:-1])
        must_list = [{"field": "parent", "type": "match", "value": parent}]
    else:
        must_list = _build_filters(components)
        # Restrict length by matching the END_MARK.
        must_list.append(
            {
                "field": _component_name(len(components)),
                "type": "match",
                "value": END_MARK,
            }
        )

    if not must_list:
        return None

    # Serialize the boolean filter holding all the constraints.
    return json.dumps({"filter": {"type": "boolean", "must": must_list}})
def translate_to_lucene_filter(components):
    """Translate a list of constraints on components to a Lucene query.

    Build an equivalent Apache Lucene filter using hardcoded assumptions about
    the index schema.

    Args:
      components: sequence of parsed glob components. It must support
        count(), index() and slicing. It is never modified.

    Returns:
      The Lucene filter as a JSON string, or None when no constraint could be
      derived from the components.

    Raises:
      GlobError: if the pattern contains more than one globstar, or if the
        globstar is not the last component.
    """
    globstars = components.count(GLOBSTAR)
    if globstars > 1:
        raise GlobError("Contains more than one globstar (**) operator")

    if globstars:
        gs_index = components.index(GLOBSTAR)
        if gs_index != len(components) - 1:
            # GLOBSTAR is only supported at the end of a glob syntax.
            raise GlobError(
                "Metric pattern syntax only supports '%s' at the end" % GLOBSTAR
            )
        # No defined length: match on the components before the GLOBSTAR.
        # A slice is used rather than pop() so that the sequence owned by the
        # caller is not altered as a side effect.
        must_list = _build_filters(components[:gs_index])
    elif (
        # Parent query: every component but the last is a single fixed
        # sequence and the last one starts with an AnySequence.
        len(components) > 1
        and all(
            len(c) == 1 and glob_utils.is_fixed_sequence(c[0])
            for c in components[:-1]
        )
        and isinstance(components[-1][0], glob_utils.AnySequence)
    ):
        parent = "".join(component[0] + "." for component in components[:-1])
        must_list = [{"field": "parent", "type": "match", "value": parent}]
    else:
        must_list = _build_filters(components)
        # Restrict length by matching the END_MARK.
        must_list.append(
            {
                "field": _component_name(len(components)),
                "type": "match",
                "value": END_MARK,
            }
        )

    if not must_list:
        return None

    # Serialize the boolean filter holding all the constraints.
    lucene_filter = {"filter": {"type": "boolean", "must": must_list}}
    return json.dumps(lucene_filter)
def translate_to_lucene_filter(components):
    """Translate a list of constraints on components to a Lucene query.

    Build an equivalent Apache Lucene filter using hardcoded assumptions about
    the index schema. Each constraint is rendered with FIELD_MATCH_VALUE and
    the constraints are then substituted into LUCENE_FILTER.

    Args:
      components: sequence of parsed glob components. It must support
        count(), index() and slicing. It is never modified.

    Returns:
      The Lucene filter as a string, or None when no constraint could be
      derived from the components.

    Raises:
      GlobError: if the pattern contains more than one globstar, or if the
        globstar is not the last component.
    """
    globstars = components.count(GLOBSTAR)
    if globstars > 1:
        raise GlobError("Contains more than one globstar (**) operator")

    if globstars:
        if components.index(GLOBSTAR) != len(components) - 1:
            # GLOBSTAR is only supported at the end of a glob syntax.
            raise GlobError(
                "Metric pattern syntax only supports '%s' at the end" % GLOBSTAR
            )
        # No defined length: match on the components before the GLOBSTAR.
        # Slice instead of pop() so the caller's sequence is left untouched.
        must_list = _build_filters(components[:-1])
    elif (
        # Parent query: every component but the last is a single fixed
        # sequence and the last one starts with an AnySequence.
        len(components) > 1
        and all(
            len(c) == 1 and glob_utils.is_fixed_sequence(c[0])
            for c in components[:-1]
        )
        and isinstance(components[-1][0], glob_utils.AnySequence)
    ):
        parent = "".join(component[0] + "." for component in components[:-1])
        must_list = [FIELD_MATCH_VALUE % ("parent", parent)]
    else:
        must_list = _build_filters(components)
        # Restrict length by matching the END_MARK.
        must_list.append(
            FIELD_MATCH_VALUE % (_component_name(len(components)), END_MARK)
        )

    if not must_list:
        return None

    # Join the constraints, one per line.
    return LUCENE_FILTER % ",\n ".join(must_list)
def __generate_normal_names_queries(self, table, components):
    """Generate the name-selection queries for a parsed glob.

    Each component is first reduced to the leading parts usable for a prefix
    query (fixed strings and brace selectors). Brace selectors are then
    expanded into one query per combination of values. If that would exceed
    self.max_queries_per_pattern, selectors are replaced by wildcards until
    the combination count is low enough.

    Args:
      table: name of the table to query, forwarded to
        __build_select_names_query.
      components: list of glob components, each a list of parts. The list is
        copied, so the caller's value is not modified.

    Returns:
      A list of queries, one per call to __build_select_names_query.
    """
    # Work on a shallow copy: every entry is replaced by a fresh list below,
    # so this is enough to leave the caller's components untouched.
    components = list(components)

    # Only keep the component parts that enable us to build prefix queries.
    # This means any uninterrupted sequence of strings or braces selectors.
    # On the way, we keep the position and value counts of selectors for
    # further query simplification.
    idxlens = []
    combinations = 1
    for cidx, component in enumerate(components):
        entry = []
        end = 0
        for pidx, part in enumerate(component):
            if isinstance(part, bg_glob.SequenceIn):
                count = len(part.values)
                combinations *= count
                entry.append((pidx, count))
            elif not bg_glob.is_fixed_sequence(part):
                # If we have globs we can't do much more.
                break
            end = pidx + 1
        idxlens.append(entry)

        simplified_component = component[:end]
        if len(simplified_component) < len(component):
            simplified_component.append(ANYSEQUENCE)
        components[cidx] = simplified_component

    # Skip any additional work if we have a basic query.
    if combinations == 1:
        return [self.__build_select_names_query(table, components)]

    # Reduce complexity by dropping the rightmost selector from each
    # component, starting with the shallowest component, until we have a low
    # enough combination count.
    for cidx, entry in enumerate(idxlens):
        if combinations <= self.max_queries_per_pattern:
            break
        while entry and combinations > self.max_queries_per_pattern:
            component = components[cidx]
            idx, count = entry.pop()

            surrounding_anyseqs = 0
            if idx > 0 and component[idx - 1] == ANYSEQUENCE:
                surrounding_anyseqs += 1
            if idx < len(component) - 1 and component[idx + 1] == ANYSEQUENCE:
                surrounding_anyseqs += 1

            # If we have surrounding AnySeqs, then drop elements so that
            # only one remains. Otherwise, replace current part with AnySeq.
            if surrounding_anyseqs > 0:
                del component[idx:idx + surrounding_anyseqs]
            else:
                component[idx] = ANYSEQUENCE

            # Floor division keeps the counter an exact int; "/" would turn
            # it into a float on Python 3.
            combinations //= count

    # Pre-compute all possible values for each component.
    for cidx, component in enumerate(components):
        ends_with_anyseq = component[-1] == ANYSEQUENCE
        if ends_with_anyseq and len(component) == 1:
            components[cidx] = [component]
            continue

        values = [""]
        truncated = False
        for part in component:
            if bg_glob.is_fixed_sequence(part):
                values = [x + part for x in values]
            elif isinstance(part, bg_glob.SequenceIn):
                values = [x + y for x in values for y in part.values]
            else:
                truncated = True
                break

        # The values are only prefixes when the component ends with a
        # wildcard, or when we stopped on a wildcard that replaced a dropped
        # selector in the middle of it. Without the suffix in the latter
        # case the prefix would be queried as an exact value.
        suffix = [ANYSEQUENCE] if (ends_with_anyseq or truncated) else []
        components[cidx] = [[x] + suffix for x in values]

    # Generate queries using the combinations of pre-computed values for the
    # components.
    return [
        self.__build_select_names_query(table, combination)
        for combination in itertools.product(*components)
    ]
def __build_select_names_query(self, table, components):
    """Build a single CQL query selecting the names matching `components`.

    Args:
      table: name of the table to query inside self.keyspace_metadata.
      components: sequence of glob components, each a list of parts. Only the
        first part of a component is used to constrain the query.

    Returns:
      The CQL query as a string. It is limited to
      self.max_metrics_per_pattern + 1 rows.
    """
    query_select = "SELECT name FROM \"%s\".\"%s\"" % (
        self.keyspace_metadata,
        table,
    )
    query_limit = "LIMIT %d" % (self.max_metrics_per_pattern + 1)

    if len(components) == 0:
        return "%s %s;" % (query_select, query_limit)

    # If all components are constant values we can search by exact name.
    # If all but the last component are constant values we can search by
    # exact parent, in which case we may benefit from filtering the last
    # component by prefix when we have one. (Code refers to the previous-to
    # -last component because of the __END__ suffix we use).
    #
    # We are not using prefix search on the parent because it appears to be
    # too slow/costly at the moment (see #174 for details).
    #
    # The length check protects components[-2] below: with the end marker as
    # the only component there is no previous-to-last component to look at.
    if (
        len(components) >= 2
        and components[-1] == [_LAST_COMPONENT]  # Not a prefix globstar
        and all(
            len(c) == 1 and bg_glob.is_fixed_sequence(c[0])
            for c in components[:-2]
        )
    ):
        last = components[-2]
        if len(last) == 1 and bg_glob.is_fixed_sequence(last[0]):
            # XXX(d.forest): do not try to optimize by passing the raw glob
            #                and using it here; because this is invalid in
            #                cases where the glob contains braces.
            name = DIRECTORY_SEPARATOR.join(
                itertools.chain.from_iterable(components[:-1])
            )
            return "%s WHERE name = %s %s;" % (
                query_select,
                c_encoder.cql_quote(name),
                query_limit,
            )

        if len(last) > 0 and bg_glob.is_fixed_sequence(last[0]):
            prefix_filter = "AND component_%d LIKE %s" % (
                len(components) - 2,
                c_encoder.cql_quote(last[0] + '%'),
            )
            allow_filtering = "ALLOW FILTERING"
        else:
            prefix_filter = ''
            allow_filtering = ''

        parent = itertools.chain.from_iterable(components[:-2])
        parent = DIRECTORY_SEPARATOR.join(parent) + DIRECTORY_SEPARATOR
        return "%s WHERE parent = %s %s %s %s;" % (
            query_select,
            c_encoder.cql_quote(parent),
            prefix_filter,
            query_limit,
            allow_filtering,
        )

    where_clauses = []
    for n, component in enumerate(components):
        if len(component) == 0:
            continue

        # We are currently using prefix indexes, so if we do not have a
        # prefix value (i.e. it is a wildcard), then the current component
        # cannot be constrained inside the request.
        value = component[0]
        if not bg_glob.is_fixed_sequence(value):
            continue

        if len(component) == 1:
            op = '='
        else:
            # More parts follow the fixed one: only the prefix is known.
            op = "LIKE"
            value += '%'

        clause = "component_%d %s %s" % (n, op, c_encoder.cql_quote(value))
        where_clauses.append(clause)

    if len(where_clauses) == 0:
        return "%s %s;" % (query_select, query_limit)

    return "%s WHERE %s %s ALLOW FILTERING;" % (
        query_select,
        " AND ".join(where_clauses),
        query_limit,
    )
def __generate_normal_names_queries(self, table, components):
    """Generate the name-selection queries for a parsed glob.

    Each component is first reduced to the leading parts usable for a prefix
    query (fixed strings and brace selectors). Brace selectors are then
    expanded into one query per combination of values. If that would exceed
    self.max_queries_per_pattern, selectors are replaced by wildcards until
    the combination count is low enough.

    Args:
      table: name of the table to query, forwarded to
        __build_select_names_query.
      components: list of glob components, each a list of parts. The list is
        copied, so the caller's value is not modified.

    Returns:
      A list of queries, one per call to __build_select_names_query.
    """
    # Work on a shallow copy: every entry is replaced by a fresh list below,
    # so this is enough to leave the caller's components untouched.
    components = list(components)

    # Only keep the component parts that enable us to build prefix queries.
    # This means any uninterrupted sequence of strings or braces selectors.
    # On the way, we keep the position and value counts of selectors for
    # further query simplification.
    idxlens = []
    combinations = 1
    for cidx, component in enumerate(components):
        entry = []
        end = 0
        for pidx, part in enumerate(component):
            if isinstance(part, bg_glob.SequenceIn):
                count = len(part.values)
                combinations *= count
                entry.append((pidx, count))
            elif not bg_glob.is_fixed_sequence(part):
                # If we have globs we can't do much more.
                break
            end = pidx + 1
        idxlens.append(entry)

        simplified_component = component[:end]
        if len(simplified_component) < len(component):
            simplified_component.append(ANYSEQUENCE)
        components[cidx] = simplified_component

    # Skip any additional work if we have a basic query.
    if combinations == 1:
        return [self.__build_select_names_query(table, components)]

    # Reduce complexity by dropping the rightmost selector from each
    # component, starting with the shallowest component, until we have a low
    # enough combination count.
    for cidx, entry in enumerate(idxlens):
        if combinations <= self.max_queries_per_pattern:
            break
        while entry and combinations > self.max_queries_per_pattern:
            component = components[cidx]
            idx, count = entry.pop()

            surrounding_anyseqs = 0
            if idx > 0 and component[idx - 1] == ANYSEQUENCE:
                surrounding_anyseqs += 1
            if idx < len(component) - 1 and component[idx + 1] == ANYSEQUENCE:
                surrounding_anyseqs += 1

            # If we have surrounding AnySeqs, then drop elements so that
            # only one remains. Otherwise, replace current part with AnySeq.
            if surrounding_anyseqs > 0:
                del component[idx:idx + surrounding_anyseqs]
            else:
                component[idx] = ANYSEQUENCE

            # Floor division keeps the counter an exact int; "/" would turn
            # it into a float on Python 3.
            combinations //= count

    # Pre-compute all possible values for each component.
    for cidx, component in enumerate(components):
        ends_with_anyseq = component[-1] == ANYSEQUENCE
        if ends_with_anyseq and len(component) == 1:
            components[cidx] = [component]
            continue

        values = [""]
        truncated = False
        for part in component:
            if bg_glob.is_fixed_sequence(part):
                values = [x + part for x in values]
            elif isinstance(part, bg_glob.SequenceIn):
                values = [x + y for x in values for y in part.values]
            else:
                truncated = True
                break

        # The values are only prefixes when the component ends with a
        # wildcard, or when we stopped on a wildcard that replaced a dropped
        # selector in the middle of it. Without the suffix in the latter
        # case the prefix would be queried as an exact value.
        suffix = [ANYSEQUENCE] if (ends_with_anyseq or truncated) else []
        components[cidx] = [[x] + suffix for x in values]

    # Generate queries using the combinations of pre-computed values for the
    # components.
    return [
        self.__build_select_names_query(table, combination)
        for combination in itertools.product(*components)
    ]
def __build_select_names_query(self, table, components):
    """Build a single CQL query selecting the names matching `components`.

    Args:
      table: name of the table to query inside self.keyspace_metadata.
      components: sequence of glob components, each a list of parts. Only the
        first part of a component is used to constrain the query.

    Returns:
      The CQL query as a string. It is limited to
      self.max_metrics_per_pattern + 1 rows.
    """
    query_select = 'SELECT name FROM "%s"."%s"' % (self.keyspace_metadata, table)
    query_limit = "LIMIT %d" % (self.max_metrics_per_pattern + 1)

    if len(components) == 0:
        return "%s %s;" % (query_select, query_limit)

    # If all components are constant values we can search by exact name.
    # If all but the last component are constant values we can search by
    # exact parent, in which case we may benefit from filtering the last
    # component by prefix when we have one. (Code refers to the previous-to
    # -last component because of the __END__ suffix we use).
    #
    # We are not using prefix search on the parent because it appears to be
    # too slow/costly at the moment (see #174 for details).
    #
    # The length check protects components[-2] below: with the end marker as
    # the only component there is no previous-to-last component to look at.
    if (
        len(components) >= 2
        and components[-1] == [_LAST_COMPONENT]  # Not a prefix globstar
        and all(
            len(c) == 1 and bg_glob.is_fixed_sequence(c[0])
            for c in components[:-2]
        )
    ):
        last = components[-2]
        if len(last) == 1 and bg_glob.is_fixed_sequence(last[0]):
            # XXX(d.forest): do not try to optimize by passing the raw glob
            #                and using it here; because this is invalid in
            #                cases where the glob contains braces.
            name = DIRECTORY_SEPARATOR.join(
                itertools.chain.from_iterable(components[:-1])
            )
            return "%s WHERE name = %s %s;" % (
                query_select,
                c_encoder.cql_quote(name),
                query_limit,
            )

        if len(last) > 0 and bg_glob.is_fixed_sequence(last[0]):
            prefix_filter = "AND component_%d LIKE %s" % (
                len(components) - 2,
                c_encoder.cql_quote(last[0] + "%"),
            )
            allow_filtering = "ALLOW FILTERING"
        else:
            prefix_filter = ""
            allow_filtering = ""

        parent = itertools.chain.from_iterable(components[:-2])
        parent = DIRECTORY_SEPARATOR.join(parent) + DIRECTORY_SEPARATOR
        return "%s WHERE parent = %s %s %s %s;" % (
            query_select,
            c_encoder.cql_quote(parent),
            prefix_filter,
            query_limit,
            allow_filtering,
        )

    where_clauses = []
    for n, component in enumerate(components):
        if len(component) == 0:
            continue

        # We are currently using prefix indexes, so if we do not have a
        # prefix value (i.e. it is a wildcard), then the current component
        # cannot be constrained inside the request.
        value = component[0]
        if not bg_glob.is_fixed_sequence(value):
            continue

        if len(component) == 1:
            op = "="
        else:
            # More parts follow the fixed one: only the prefix is known.
            op = "LIKE"
            value += "%"

        clause = "component_%d %s %s" % (n, op, c_encoder.cql_quote(value))
        where_clauses.append(clause)

    if len(where_clauses) == 0:
        return "%s %s;" % (query_select, query_limit)

    return "%s WHERE %s %s ALLOW FILTERING;" % (
        query_select,
        " AND ".join(where_clauses),
        query_limit,
    )