Example #1
0
    def processOpFields(self, fields, op, content):
        """
            Record how each field referenced by an operation is used.

            fields  -- mapping of field name -> field entry; a missing entry
                       is created on demand and its 'query_use_count' is
                       incremented for every key found in content
            op      -- the operation; unless it is an insert, its
                       'predicates' mapping is updated with the predicate
                       type (regex, range or equality) of each key
            content -- mapping of field name -> value taken from the operation
        """
        if self.debug:
            LOG.debug("Processing operation fields\n%s", pformat(content))

        for name, value in content.iteritems():
            # Keys carrying our special prefix are flag markers used by
            # MongoDB's queries rather than real fields, so ignore them
            if name.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
                continue

            # The field can be missing from the collection when the op was
            # an aggregate (those are ignored when recreating the schema),
            # so create its entry on demand before counting this use
            if name not in fields:
                type_name = catalog.fieldTypeToString(type(value))
                fields[name] = catalog.Collection.fieldFactory(name, type_name)
            fields[name]['query_use_count'] += 1

            # Insert operations have neither predicates nor projections
            if op['type'] == constants.OP_TYPE_INSERT:
                continue

            # Decide how this key is used as a predicate. Regex and range
            # always overwrite; equality never replaces an existing entry
            if workload.isOpRegex(op, field=name):
                pred_type = constants.PRED_TYPE_REGEX
            elif isinstance(value, dict):
                pred_type = constants.PRED_TYPE_RANGE
            elif name in op['predicates']:
                continue
            else:
                pred_type = constants.PRED_TYPE_EQUALITY
            op['predicates'][name] = pred_type

            ## TODO: Should we expect there to be field names with dot notation here?
            ##       Or should have all been cleaned out by the converters?
        ## FOR
    def testIsOpRegex(self):
        op = {
            'collection': 'blah',
            'predicates': {'_id': constants.PRED_TYPE_REGEX},
            'query_aggregate': True,
            'query_content': [
                    {'#query': {'_id': {'#options': 'XXXXXXX',
                                        '#regex':   'YYYYY'}},
                     'count': 'site.songs',
                     'fields': None}],
           'query_group': None,
           'query_hash': 3563430808431869716L,
           'query_id': 579750519L,
           'query_limit': -1,
           'query_offset': 0,
           'query_size': 125,
           'query_time': 1338410992.894204,
           'resp_content': [{'n': 16, 'ok': 1}],
           'resp_id': 108641633L,
           'resp_size': 64,
           'resp_time': 1338410992.911907,
           'type': constants.OP_TYPE_QUERY,
           'update_multi': None,
           'update_upsert': None
        }

        ret = workload.isOpRegex(op)
        self.assertTrue(ret)
    def processOpFields(self, fields, op, content):
        """
            Update field usage statistics and predicate types for an operation.

            Every key of content (except the prefixed flag markers) gets its
            'query_use_count' bumped in fields, creating the field entry first
            if needed. For non-insert operations op['predicates'] is filled in
            with the kind of predicate each key is used with.
        """
        if self.debug:
            LOG.debug("Processing operation fields\n%s", pformat(content))

        for field_name, field_value in content.iteritems():
            # Anything starting with our special char is a flag marker
            # used by MongoDB's queries, not an actual field
            is_flag_marker = field_name.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX)
            if is_flag_marker:
                continue

            # Aggregates are ignored when recreating the schema, so this may
            # be the first time we see the field: add it to the collection
            if field_name not in fields:
                fields[field_name] = catalog.Collection.fieldFactory(
                    field_name,
                    catalog.fieldTypeToString(type(field_value)),
                )
            fields[field_name]['query_use_count'] += 1

            # Only non-insert operations carry predicates/projections
            if op['type'] != constants.OP_TYPE_INSERT:
                if workload.isOpRegex(op, field=field_name):
                    # Regex usage always wins
                    op['predicates'][field_name] = constants.PRED_TYPE_REGEX
                elif isinstance(field_value, dict):
                    # A nested document is treated as a range predicate
                    op['predicates'][field_name] = constants.PRED_TYPE_RANGE
                elif field_name not in op['predicates']:
                    # Equality is only the fallback for unseen keys
                    op['predicates'][field_name] = constants.PRED_TYPE_EQUALITY
            ## IF

            ## TODO: Should we expect there to be field names with dot notation here?
            ##       Or should have all been cleaned out by the converters?
        ## FOR
Example #4
0
 def __getIsOpRegex__(self, cache, op):
     """
         Return whether the operation is a regex operation, using the
         per-query-hash results stored in cache.op_regex when available.
     """
     cached = cache.op_regex.get(op["query_hash"], None)
     if cached is not None:
         # Cache hit: nothing to compute, just account for it in debug mode
         if self.debug:
             self.cache_hit_ctr.put("op_regex")
         return cached

     # Cache miss: compute the answer, and remember it only if caching is on
     computed = workload.isOpRegex(op)
     if self.cache_enable:
         if self.debug:
             self.cache_miss_ctr.put("op_regex")
         cache.op_regex[op["query_hash"]] = computed
     return computed
Example #5
0
 def __getIsOpRegex__(self, cache, op):
     """
         Look up (or compute) the regex flag of an operation.

         The flag is keyed by op["query_hash"] in cache.op_regex. On a miss it
         is computed with workload.isOpRegex and stored when self.cache_enable
         is set. Hit/miss counters are only updated in debug mode.
     """
     regex_flag = cache.op_regex.get(op["query_hash"], None)
     if regex_flag is not None:
         if self.debug:
             self.cache_hit_ctr.put("op_regex")
     else:
         regex_flag = workload.isOpRegex(op)
         if self.cache_enable:
             # Misses are only counted when the result is actually cached
             if self.debug:
                 self.cache_miss_ctr.put("op_regex")
             cache.op_regex[op["query_hash"]] = regex_flag
     return regex_flag
    def guess_op_info(self, design, op):
        """
            Guess which index of the design this operation would use.

            Returns a tuple (best_index, covering, index_size, slot_size):
              best_index -- the index with the highest ratio of usable keys,
                            or None when no index has a usable key
              covering   -- True if best_index covers a query operation
                            that has a projection
              index_size -- size of best_index summed over the collection and
                            its children, or 1 if size estimation is disabled
              slot_size  -- slot size derived from the collection's cost
        """
        col_name = op["collection"]
        indexes = design.getIndexes(col_name)

        # Gather every key referenced by the operation's contents...
        op_keys = [
            key
            for entry in workload.getOpContents(op)
            for key in entry.iterkeys()
        ]

        # ...followed by the keys of its projection (op["query_fields"])
        projection = op.get("query_fields", None)
        has_projection = bool(projection)
        if has_projection:
            op_keys.extend(projection.iterkeys())

        # Pick the index whose leading keys are used the most by the operation
        best_index = None
        best_ratio = None
        for candidate in indexes:
            usable = 0
            for index_key in candidate:
                # Only a prefix of the index can be used: stop at the
                # first key the operation does not reference
                if index_key not in op_keys:
                    break
                # A field used in a regex operation can't make use of the index
                if not workload.isOpRegex(op, field=index_key):
                    usable += 1
                if usable >= len(op_keys):
                    break
            ratio = usable / float(len(candidate))

            # Worse than what we already have
            if best_index and ratio < best_ratio:
                continue
            # Same ratio: only an index with more keys replaces the current one
            if ratio == best_ratio and len(candidate) <= len(best_index):
                continue
            # An index without any usable key is never selected
            if ratio != 0:
                best_index = candidate
                best_ratio = ratio
        ## FOR
        if self.debug:
            LOG.debug("Op #%d - BestIndex:%s / BestRatio:%s", op["query_id"], best_index, best_ratio)

        # The index is covering when the operation's keys are exactly the
        # leading keys of the index, in the same order
        covering = False
        if has_projection and best_index and op["type"] == constants.OP_TYPE_QUERY:
            index_keys = list(best_index)
            if len(op_keys) <= len(index_keys):
                covering = index_keys[:len(op_keys)] == op_keys
        ## IF

        # Size of the best index, including the children of this collection
        if self.no_index_size_estimation:
            index_size = 1
        else:
            index_size = getIndexSize(self.state.collections[col_name], best_index)
            if col_name in self.parent_to_children_map:
                for child in self.parent_to_children_map[col_name]:
                    index_size += getIndexSize(self.state.collections[child], best_index)
                ## FOR
            ## IF
        ## IF

        # Slot size of this operation
        assert col_name not in self.child_collections, (
            "collection %s should not be queried.\n child_collecitons: %s\ndesign: \n%s"
            % (col_name, self.child_collections, design)
        )
        if col_name in self.col_cost_map:
            slot_size = int(math.ceil(self.col_cost_map[col_name]))
            # Any cost other than 1 is scaled up by a factor of 100
            if slot_size != 1:
                slot_size *= 100
        else:
            slot_size = 1

        return best_index, covering, index_size, slot_size
Example #7
0
    def guess_op_info(self, design, op):
        """
            Estimate the index information for an operation under a design.

            Returns the tuple (best_index, covering, index_size, slot_size)
            where best_index is the chosen index (None if no index is usable),
            covering tells whether that index covers the whole query including
            its projection, index_size is the size of that index (1 when size
            estimation is turned off) and slot_size is the slot size computed
            from the collection's cost.
        """
        col_name = op['collection']
        indexes = design.getIndexes(col_name)

        # All the keys the operation refers to: query keys first...
        referenced = []
        for entry in workload.getOpContents(op):
            referenced.extend(entry.iterkeys())

        # ...then the projection keys, stored in op['query_fields']
        projection = op.get('query_fields', None)
        has_projection = False
        if projection:
            has_projection = True
            referenced.extend(projection.iterkeys())

        # Simply choose the index that has most of its fields
        # referenced in the operation
        best_index, best_ratio = None, None
        for index in indexes:
            matched = 0
            for index_key in index:
                if index_key in referenced:
                    # We can't use a field that is part of a regex operation
                    if not workload.isOpRegex(op, field=index_key):
                        matched += 1
                    if matched < len(referenced):
                        continue
                # Either the key is not referenced or every
                # referenced key has already been matched
                break
            ratio = matched / float(len(index))

            if not best_index or ratio >= best_ratio:
                # On equal ratios the index with the most keys is preferred,
                # and an index with no matching key is never chosen
                tie_lost = ratio == best_ratio and len(index) <= len(best_index)
                if ratio != 0 and not tie_lost:
                    best_index = index
                    best_ratio = ratio
        ## FOR
        if self.debug:
            LOG.debug("Op #%d - BestIndex:%s / BestRatio:%s",
                      op['query_id'], best_index, best_ratio)

        # Covering index: every referenced key must match the index key
        # found at the same position
        covering = False
        if has_projection and best_index and op['type'] == constants.OP_TYPE_QUERY:
            if len(referenced) <= len(best_index):
                covering = all(
                    op_key == index_key
                    for op_key, index_key in zip(referenced, best_index)
                )
        ## IF

        # Get the size of the best index, children collections included
        if not self.no_index_size_estimation:
            sized_collections = [col_name]
            if col_name in self.parent_to_children_map:
                sized_collections.extend(self.parent_to_children_map[col_name])
            index_size = 0
            for name in sized_collections:
                index_size += getIndexSize(self.state.collections[name], best_index)
            ## FOR
        else:
            index_size = 1

        # Get the slot size of this operation
        assert col_name not in self.child_collections, \
            "collection %s should not be queried.\n child_collecitons: %s\ndesign: \n%s" % (
                col_name, self.child_collections, design)

        slot_size = 1
        if col_name in self.col_cost_map:
            slot_size = int(math.ceil(self.col_cost_map[col_name]))
        # Every slot size except 1 gets multiplied by 100
        if slot_size != 1:
            slot_size *= 100

        return best_index, covering, index_size, slot_size