Code example #1
    def setUp(self):
        # Create a fake Collection catalog entry
        # WORKLOAD
        self.col_info = catalog.Collection()
        self.col_info['name'] = COLLECTION_NAME
        self.col_info['doc_count'] = NUM_DOCUMENTS
        self.col_info['workload_queries'] = 1000
        self.col_info['workload_percent'] = 1.0

        for f in xrange(NUM_FIELDS+1):
            # We always need the _id field
            if not f:
                f_name = "_id"
                f_type = catalog.fieldTypeToString(int)
                f_size = catalog.getEstimatedSize(f_type, 10000)
            else:
                f_name = "field%02d" % f
                if f % 2 == 0:
                    f_type = catalog.fieldTypeToString(long)
                    f_size = catalog.getEstimatedSize(f_type, 10000000L)
                else:
                    f_type = catalog.fieldTypeToString(str)
                    f_size = 128

            f = catalog.Collection.fieldFactory(f_name, f_type)
            f['avg_size'] = f_size
            f['query_use_count'] = self.col_info['workload_queries']
            self.col_info['fields'][f_name] = f
            self.col_info['interesting'].append(f_name)
            self.col_info['avg_doc_size'] += f_size
        ## FOR (field)

        self.design = Design()
        self.design.addCollection(self.col_info['name'])
        self.design.addIndex(self.col_info['name'], ["_id"])
        self.design.addIndex(self.col_info['name'], self.col_info['interesting'][1:3])

        self.buffer = LRUBuffer({self.col_info['name']: self.col_info}, BUFFER_SIZE)
        self.buffer.initialize(self.design)
Code example #2
    def setUp(self):
        # Create a fake Collection catalog entry
        # WORKLOAD
        self.col_info = catalog.Collection()
        self.col_info['name'] = COLLECTION_NAME
        self.col_info['doc_count'] = NUM_DOCUMENTS
        self.col_info['workload_queries'] = 1000
        self.col_info['workload_percent'] = 1.0

        for f in xrange(NUM_FIELDS + 1):
            # We always need the _id field
            if not f:
                f_name = "_id"
                f_type = catalog.fieldTypeToString(int)
                f_size = catalog.getEstimatedSize(f_type, 10000)
            else:
                f_name = "field%02d" % f
                if f % 2 == 0:
                    f_type = catalog.fieldTypeToString(long)
                    f_size = catalog.getEstimatedSize(f_type, 10000000L)
                else:
                    f_type = catalog.fieldTypeToString(str)
                    f_size = 128

            f = catalog.Collection.fieldFactory(f_name, f_type)
            f['avg_size'] = f_size
            f['query_use_count'] = self.col_info['workload_queries']
            self.col_info['fields'][f_name] = f
            self.col_info['interesting'].append(f_name)
            self.col_info['avg_doc_size'] += f_size
        ## FOR (field)

        self.design = Design()
        self.design.addCollection(self.col_info['name'])
        self.design.addIndex(self.col_info['name'], ["_id"])
        self.design.addIndex(self.col_info['name'],
                             self.col_info['interesting'][1:3])

        self.buffer = LRUBuffer({self.col_info['name']: self.col_info},
                                BUFFER_SIZE)
        self.buffer.initialize(self.design)
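
The fixture above builds a synthetic catalog.Collection, registers two indexes in a Design, and hands both to an LRUBuffer capped at BUFFER_SIZE bytes. As a point of reference only, the following is a minimal sketch of generic least-recently-used eviction using collections.OrderedDict; the names SimpleLRUBuffer and touch are illustrative and are not the project's LRUBuffer API.

# Minimal, generic LRU-eviction sketch (illustrative only; not the
# project's LRUBuffer). OrderedDict preserves insertion order, so the
# entry at the front is always the least recently used one.
from collections import OrderedDict

class SimpleLRUBuffer(object):
    def __init__(self, capacity_bytes):
        self.capacity = capacity_bytes
        self.used = 0
        self.entries = OrderedDict()  # key -> size in bytes

    def touch(self, key, size):
        # Re-inserting an existing key moves it to the most-recent position.
        if key in self.entries:
            self.used -= self.entries.pop(key)
        self.entries[key] = size
        self.used += size
        # Evict least-recently-used entries until the contents fit again.
        while self.used > self.capacity and len(self.entries) > 1:
            _, evicted_size = self.entries.popitem(last=False)
            self.used -= evicted_size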
Code example #3
    def processDataFields(self, col_info, fields, doc):
        """
            Recursively traverse a single document and extract out the field information
        """
        if self.debug:
            LOG.debug("Extracting fields for document:\n%s" % pformat(doc))

        # Check if the current doc has parent_col, but this will only apply to its fields
        parent_col = doc.get('parent_col', None)

        for k, v in doc.iteritems():
            # Skip if this is the _id field
            if constants.SKIP_MONGODB_ID_FIELD and k == '_id': continue
            if k == constants.FUNCTIONAL_FIELD: continue
            f_type = type(v)
            f_type_str = catalog.fieldTypeToString(f_type)

            if not k in fields:
                # This is only a subset of what we will compute for each field
                # See catalog.Collection for more information
                if self.debug:
                    LOG.debug("Creating new field entry for '%s'" % k)
                fields[k] = catalog.Collection.fieldFactory(k, f_type_str)
            else:
                fields[k]['type'] = f_type_str
                # Sanity check
                # This won't work if the data is not uniform
                #if v != None:
                #assert fields[k]['type'] == f_type_str, \
                #"Mismatched field types '%s' <> '%s' for '%s'" % (fields[k]['type'], f_type_str, k)

            # We will store the distinct values for each field in a set
            # that is embedded in the field. We will delete it when
            # we call computeFieldStats()
            if not 'distinct_values' in fields[k]:
                fields[k]['distinct_values'] = set()
            if not "num_values" in fields[k]:
                fields[k]['num_values'] = 0
            # Likewise, we will also store a histogram for the different sizes
            # of each field. We will use this later on to compute the weighted average
            if not 'size_histogram' in fields[k]:
                fields[k]['size_histogram'] = Histogram()
            # Maintain a histogram of list lengths
            if not 'list_len' in fields[k]:
                fields[k]['list_len'] = Histogram()

            if fields[k]['query_use_count'] > 0 and not k in col_info[
                    'interesting']:
                col_info['interesting'].append(k)

            ## ----------------------------------------------
            ## NESTED FIELDS
            ## ----------------------------------------------
            if isinstance(v, dict):
                # Check for a special data field
                if len(v) == 1 and v.keys()[0].startswith(
                        constants.REPLACE_KEY_DOLLAR_PREFIX):
                    v = v[v.keys()[0]]
                    # HACK to handle lists (hopefully dict as well) from nested IN clauses...
                    all_values = v if isinstance(v, list) else [v]
                    for v in all_values:
                        if isinstance(v, dict):
                            v = v.values()[0]

                        fields[k]['type'] = catalog.fieldTypeToString(type(v))
                        try:
                            size = catalog.getEstimatedSize(
                                fields[k]['type'], v)
                            self.total_field_ctr += 1
                        except:
                            if self.debug:
                                LOG.error("Failed to estimate size for field '%s' in collection '%s'\n%s", \
                                    k, col_info['name'], pformat(fields[k]))
                            self.err_field_ctr += 1
                            LOG.info(
                                "Total fields so far [%s], error fields [%s]",
                                self.total_field_ctr, self.err_field_ctr)
                            continue
                        col_info['data_size'] += size
                        fields[k]['size_histogram'].put(size)
                        fields[k]['distinct_values'].add(v)
                        fields[k]['num_values'] += 1
                        if parent_col:
                            fields[k]['parent_col'] = parent_col
                    ## FOR
                else:
                    if self.debug:
                        LOG.debug("Extracting keys in nested field for '%s'" %
                                  k)
                    if not 'fields' in fields[k]: fields[k]['fields'] = {}
                    self.processDataFields(col_info, fields[k]['fields'],
                                           doc[k])

            ## ----------------------------------------------
            ## LIST OF VALUES
            ## Could be either scalars or dicts. If it's a dict, then we'll just
            ## store the nested field information in the 'fields' value
            ## If it's a list, then we'll use a special marker 'LIST_INNER_FIELD' to
            ## store the field information for the inner values.
            ## ----------------------------------------------
            elif isinstance(v, list):
                if self.debug:
                    LOG.debug("Extracting keys in nested list for '%s'" % k)
                if not 'fields' in fields[k]: fields[k]['fields'] = {}

                list_len = len(doc[k])
                fields[k]['list_len'].put(list_len)
                for i in xrange(list_len):
                    inner_type = type(doc[k][i])
                    # More nested documents...
                    if inner_type == dict:
                        if self.debug:
                            LOG.debug(
                                "Extracting keys in nested field in list position %d for '%s'"
                                % (i, k))
                        self.processDataFields(col_info, fields[k]['fields'],
                                               doc[k][i])
                    else:
                        # TODO: We probably should store a list of types here in case
                        #       the list has different types of values
                        inner = fields[k]['fields'].get(
                            constants.LIST_INNER_FIELD, {})
                        inner['type'] = catalog.fieldTypeToString(inner_type)
                        try:
                            inner_size = catalog.getEstimatedSize(
                                inner['type'], doc[k][i])
                            self.total_field_ctr += 1
                        except:
                            if self.debug:
                                LOG.error("Failed to estimate size for list entry #%d for field '%s' in collection '%s'\n%s",\
                                      i, k, col_info['name'], pformat(fields[k]))
                            self.err_field_ctr += 1
                            LOG.info(
                                "Total fields so far [%s], error fields [%s]",
                                self.total_field_ctr, self.err_field_ctr)
                            continue

                        fields[k]['fields'][constants.LIST_INNER_FIELD] = inner
                        fields[k]['size_histogram'].put(inner_size)
                        fields[k]['distinct_values'].add(doc[k][i])
                        fields[k]['num_values'] += 1
                        if parent_col:
                            fields[k]['parent_col'] = parent_col
                ## FOR (list)
            ## ----------------------------------------------
            ## SCALAR VALUES
            ## ----------------------------------------------
            else:
                try:
                    size = catalog.getEstimatedSize(fields[k]['type'], v)
                    self.total_field_ctr += 1
                except:
                    LOG.error("Failed to estimate size for field %s in collection %s\n%s",\
                              k, col_info['name'], pformat(fields[k]))
                    self.err_field_ctr += 1
                    LOG.info("Total fields so far [%s], error fields [%s]",
                             self.total_field_ctr, self.err_field_ctr)
                    continue

                col_info['data_size'] += size
                fields[k]['size_histogram'].put(size)
                fields[k]['distinct_values'].add(v)
                fields[k]['num_values'] += 1
                if parent_col:
                    fields[k]['parent_col'] = parent_col
Code example #4
    def processDataFields(self, col_info, fields, doc):
        """
            Recursively traverse a single document and extract out the field information
        """
        if self.debug:
            LOG.debug("Extracting fields for document:\n%s" % pformat(doc))

        # Check if the current doc has parent_col, but this will only apply to its fields
        parent_col = doc.get("parent_col", None)

        for k, v in doc.iteritems():
            # Skip if this is the _id field
            if constants.SKIP_MONGODB_ID_FIELD and k == "_id":
                continue
            if k == constants.FUNCTIONAL_FIELD:
                continue
            f_type = type(v)
            f_type_str = catalog.fieldTypeToString(f_type)

            if not k in fields:
                # This is only a subset of what we will compute for each field
                # See catalog.Collection for more information
                if self.debug:
                    LOG.debug("Creating new field entry for '%s'" % k)
                fields[k] = catalog.Collection.fieldFactory(k, f_type_str)
            else:
                fields[k]["type"] = f_type_str
                # Sanity check
                # This won't work if the data is not uniform
                # if v != None:
                # assert fields[k]['type'] == f_type_str, \
                # "Mismatched field types '%s' <> '%s' for '%s'" % (fields[k]['type'], f_type_str, k)

            # We will store the distinct values for each field in a set
            # that is embedded in the field. We will delete it when
            # we call computeFieldStats()
            if not "distinct_values" in fields[k]:
                fields[k]["distinct_values"] = set()
            if not "num_values" in fields[k]:
                fields[k]["num_values"] = 0
            # Likewise, we will also store a histogram for the different sizes
            # of each field. We will use this later on to compute the weighted average
            if not "size_histogram" in fields[k]:
                fields[k]["size_histogram"] = Histogram()
            # Maintain a histogram of list lengths
            if not "list_len" in fields[k]:
                fields[k]["list_len"] = Histogram()

            if fields[k]["query_use_count"] > 0 and not k in col_info["interesting"]:
                col_info["interesting"].append(k)

            ## ----------------------------------------------
            ## NESTED FIELDS
            ## ----------------------------------------------
            if isinstance(v, dict):
                # Check for a special data field
                if len(v) == 1 and v.keys()[0].startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
                    v = v[v.keys()[0]]
                    # HACK to handle lists (hopefully dict as well) from nested IN clauses...
                    all_values = v if isinstance(v, list) else [v]
                    for v in all_values:
                        if isinstance(v, dict):
                            v = v.values()[0]

                        fields[k]["type"] = catalog.fieldTypeToString(type(v))
                        try:
                            size = catalog.getEstimatedSize(fields[k]["type"], v)
                            self.total_field_ctr += 1
                        except:
                            if self.debug:
                                LOG.error(
                                    "Failed to estimate size for field '%s' in collection '%s'\n%s",
                                    k,
                                    col_info["name"],
                                    pformat(fields[k]),
                                )
                            self.err_field_ctr += 1
                            LOG.info(
                                "Total fields so far [%s], error fields [%s]", self.total_field_ctr, self.err_field_ctr
                            )
                            continue
                        col_info["data_size"] += size
                        fields[k]["size_histogram"].put(size)
                        fields[k]["distinct_values"].add(v)
                        fields[k]["num_values"] += 1
                        if parent_col:
                            fields[k]["parent_col"] = parent_col
                    ## FOR
                else:
                    if self.debug:
                        LOG.debug("Extracting keys in nested field for '%s'" % k)
                    if not "fields" in fields[k]:
                        fields[k]["fields"] = {}
                    self.processDataFields(col_info, fields[k]["fields"], doc[k])

            ## ----------------------------------------------
            ## LIST OF VALUES
            ## Could be either scalars or dicts. If it's a dict, then we'll just
            ## store the nested field information in the 'fields' value
            ## If it's a list, then we'll use a special marker 'LIST_INNER_FIELD' to
            ## store the field information for the inner values.
            ## ----------------------------------------------
            elif isinstance(v, list):
                if self.debug:
                    LOG.debug("Extracting keys in nested list for '%s'" % k)
                if not "fields" in fields[k]:
                    fields[k]["fields"] = {}

                list_len = len(doc[k])
                fields[k]["list_len"].put(list_len)
                for i in xrange(list_len):
                    inner_type = type(doc[k][i])
                    # More nested documents...
                    if inner_type == dict:
                        if self.debug:
                            LOG.debug("Extracting keys in nested field in list position %d for '%s'" % (i, k))
                        self.processDataFields(col_info, fields[k]["fields"], doc[k][i])
                    else:
                        # TODO: We probably should store a list of types here in case
                        #       the list has different types of values
                        inner = fields[k]["fields"].get(constants.LIST_INNER_FIELD, {})
                        inner["type"] = catalog.fieldTypeToString(inner_type)
                        try:
                            inner_size = catalog.getEstimatedSize(inner["type"], doc[k][i])
                            self.total_field_ctr += 1
                        except:
                            if self.debug:
                                LOG.error(
                                    "Failed to estimate size for list entry #%d for field '%s' in collection '%s'\n%s",
                                    i,
                                    k,
                                    col_info["name"],
                                    pformat(fields[k]),
                                )
                            self.err_field_ctr += 1
                            LOG.info(
                                "Total fields so far [%s], error fields [%s]", self.total_field_ctr, self.err_field_ctr
                            )
                            continue

                        fields[k]["fields"][constants.LIST_INNER_FIELD] = inner
                        fields[k]["size_histogram"].put(inner_size)
                        fields[k]["distinct_values"].add(doc[k][i])
                        fields[k]["num_values"] += 1
                        if parent_col:
                            fields[k]["parent_col"] = parent_col
                ## FOR (list)
            ## ----------------------------------------------
            ## SCALAR VALUES
            ## ----------------------------------------------
            else:
                try:
                    size = catalog.getEstimatedSize(fields[k]["type"], v)
                    self.total_field_ctr += 1
                except:
                    LOG.error(
                        "Failed to estimate size for field %s in collection %s\n%s",
                        k,
                        col_info["name"],
                        pformat(fields[k]),
                    )
                    self.err_field_ctr += 1
                    LOG.info("Total fields so far [%s], error fields [%s]", self.total_field_ctr, self.err_field_ctr)
                    continue

                col_info["data_size"] += size
                fields[k]["size_histogram"].put(size)
                fields[k]["distinct_values"].add(v)
                fields[k]["num_values"] += 1
                if parent_col:
                    fields[k]["parent_col"] = parent_col