def setUp(self): # Create a fake Collection catalog entry # WORKLOAD self.col_info = catalog.Collection() self.col_info['name'] = COLLECTION_NAME self.col_info['doc_count'] = NUM_DOCUMENTS self.col_info['workload_queries'] = 1000 self.col_info['workload_percent'] = 1.0 for f in xrange(NUM_FIELDS+1): # We always need the _id field if not f: f_name = "_id" f_type = catalog.fieldTypeToString(int) f_size = catalog.getEstimatedSize(f_type, 10000) else: f_name = "field%02d" % f if f % 2 == 0: f_type = catalog.fieldTypeToString(long) f_size = catalog.getEstimatedSize(f_type, 10000000l) else: f_type = catalog.fieldTypeToString(str) f_size = 128 f = catalog.Collection.fieldFactory(f_name, f_type) f['avg_size'] = f_size f['query_use_count'] = self.col_info['workload_queries'] self.col_info['fields'][f_name] = f self.col_info['interesting'].append(f_name) self.col_info['avg_doc_size'] += f_size ## FOR (field) self.design = Design() self.design.addCollection(self.col_info['name']) self.design.addIndex(self.col_info['name'], ["_id"]) self.design.addIndex(self.col_info['name'], self.col_info['interesting'][1:3]) self.buffer = LRUBuffer({self.col_info['name']: self.col_info}, BUFFER_SIZE) self.buffer.initialize(self.design)
def setUp(self): # Create a fake Collection catalog entry # WORKLOAD self.col_info = catalog.Collection() self.col_info['name'] = COLLECTION_NAME self.col_info['doc_count'] = NUM_DOCUMENTS self.col_info['workload_queries'] = 1000 self.col_info['workload_percent'] = 1.0 for f in xrange(NUM_FIELDS + 1): # We always need the _id field if not f: f_name = "_id" f_type = catalog.fieldTypeToString(int) f_size = catalog.getEstimatedSize(f_type, 10000) else: f_name = "field%02d" % f if f % 2 == 0: f_type = catalog.fieldTypeToString(long) f_size = catalog.getEstimatedSize(f_type, 10000000l) else: f_type = catalog.fieldTypeToString(str) f_size = 128 f = catalog.Collection.fieldFactory(f_name, f_type) f['avg_size'] = f_size f['query_use_count'] = self.col_info['workload_queries'] self.col_info['fields'][f_name] = f self.col_info['interesting'].append(f_name) self.col_info['avg_doc_size'] += f_size ## FOR (field) self.design = Design() self.design.addCollection(self.col_info['name']) self.design.addIndex(self.col_info['name'], ["_id"]) self.design.addIndex(self.col_info['name'], self.col_info['interesting'][1:3]) self.buffer = LRUBuffer({self.col_info['name']: self.col_info}, BUFFER_SIZE) self.buffer.initialize(self.design)
def processDataFields(self, col_info, fields, doc):
    """
        Recursively traverse a single document and extract out the field information

        col_info - the collection's catalog entry; its 'data_size' and
                   'interesting' entries are updated in place.
        fields   - the (possibly nested) per-field dictionary to populate
                   in place.
        doc      - the document (dict) to examine.
    """
    if self.debug: LOG.debug("Extracting fields for document:\n%s" % pformat(doc))

    # Check if the current doc has parent_col, but this will only apply to its fields
    parent_col = doc.get('parent_col', None)

    for k, v in doc.iteritems():
        # Skip if this is the _id field
        if constants.SKIP_MONGODB_ID_FIELD and k == '_id': continue
        # Skip the internal functional-field marker as well
        if k == constants.FUNCTIONAL_FIELD: continue

        f_type = type(v)
        f_type_str = catalog.fieldTypeToString(f_type)

        if not k in fields:
            # This is only subset of what we will compute for each field
            # See catalog.Collection for more information
            if self.debug: LOG.debug("Creating new field entry for '%s'" % k)
            fields[k] = catalog.Collection.fieldFactory(k, f_type_str)
        else:
            # Record the most recently observed type for this field
            fields[k]['type'] = f_type_str
            # Sanity check
            # This won't work if the data is not uniform
            #if v != None:
                #assert fields[k]['type'] == f_type_str, \
                #"Mismatched field types '%s' <> '%s' for '%s'" % (fields[k]['type'], f_type_str, k)

        # We will store the distinct values for each field in a set
        # that is embedded in the field. We will delete it when
        # we call computeFieldStats()
        if not 'distinct_values' in fields[k]:
            fields[k]['distinct_values'] = set()
        if not "num_values" in fields[k]:
            fields[k]['num_values'] = 0
        # Likewise, we will also store a histogram for the different sizes
        # of each field. We will use this later on to compute the weighted average
        if not 'size_histogram' in fields[k]:
            fields[k]['size_histogram'] = Histogram()
        # Maintain a histogram of list lengths
        if not 'list_len' in fields[k]:
            fields[k]['list_len'] = Histogram()

        # Any field that the workload's queries touch gets flagged as interesting
        if fields[k]['query_use_count'] > 0 and not k in col_info['interesting']:
            col_info['interesting'].append(k)

        ## ----------------------------------------------
        ## NESTED FIELDS
        ## ----------------------------------------------
        if isinstance(v, dict):
            # Check for a special data field
            # (a single-key dict whose key carries the replaced-'$' prefix)
            if len(v) == 1 and v.keys()[0].startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
                v = v[v.keys()[0]]
                # HACK to handle lists (hopefully dict as well)from nested IN clauses...
                all_values = v if isinstance(v, list) else [v]
                for v in all_values:
                    if isinstance(v, dict):
                        # presumably another single-entry operator dict; take its value
                        v = v.values()[0]
                    fields[k]['type'] = catalog.fieldTypeToString(type(v))
                    try:
                        size = catalog.getEstimatedSize(fields[k]['type'], v)
                        self.total_field_ctr += 1
                    # NOTE(review): bare except also swallows SystemExit and
                    # KeyboardInterrupt -- consider narrowing to 'except Exception:'
                    except:
                        if self.debug:
                            LOG.error("Failed to estimate size for field '%s' in collection '%s'\n%s", \
                                      k, col_info['name'], pformat(fields[k]))
                        self.err_field_ctr += 1
                        LOG.info("Total fields so far [%s], error fields [%s]",
                                 self.total_field_ctr, self.err_field_ctr)
                        continue
                    col_info['data_size'] += size
                    fields[k]['size_histogram'].put(size)
                    fields[k]['distinct_values'].add(v)
                    fields[k]['num_values'] += 1
                    if parent_col:
                        fields[k]['parent_col'] = parent_col
                ## FOR
            else:
                if self.debug: LOG.debug("Extracting keys in nested field for '%s'" % k)
                if not 'fields' in fields[k]:
                    fields[k]['fields'] = {}
                # Recurse into the embedded document
                self.processDataFields(col_info, fields[k]['fields'], doc[k])

        ## ----------------------------------------------
        ## LIST OF VALUES
        ## Could be either scalars or dicts. If it's a dict, then we'll just
        ## store the nested field information in the 'fields' value
        ## If it's a list, then we'll use a special marker 'LIST_INNER_FIELD' to
        ## store the field information for the inner values.
        ## ----------------------------------------------
        elif isinstance(v, list):
            if self.debug: LOG.debug("Extracting keys in nested list for '%s'" % k)
            if not 'fields' in fields[k]:
                fields[k]['fields'] = {}
            list_len = len(doc[k])
            fields[k]['list_len'].put(list_len)
            for i in xrange(list_len):
                inner_type = type(doc[k][i])
                # More nested documents...
                if inner_type == dict:
                    if self.debug:
                        LOG.debug("Extracting keys in nested field in list position %d for '%s'" % (i, k))
                    self.processDataFields(col_info, fields[k]['fields'], doc[k][i])
                else:
                    # TODO: We probably should store a list of types here in case
                    # the list has different types of values
                    inner = fields[k]['fields'].get(constants.LIST_INNER_FIELD, {})
                    inner['type'] = catalog.fieldTypeToString(inner_type)
                    try:
                        inner_size = catalog.getEstimatedSize(inner['type'], doc[k][i])
                        self.total_field_ctr += 1
                    # NOTE(review): bare except -- same caveat as above
                    except:
                        if self.debug:
                            LOG.error("Failed to estimate size for list entry #%d for field '%s' in collection '%s'\n%s",\
                                      i, k, col_info['name'], pformat(fields[k]))
                        self.err_field_ctr += 1
                        LOG.info("Total fields so far [%s], error fields [%s]",
                                 self.total_field_ctr, self.err_field_ctr)
                        continue
                    fields[k]['fields'][constants.LIST_INNER_FIELD] = inner
                    fields[k]['size_histogram'].put(inner_size)
                    fields[k]['distinct_values'].add(doc[k][i])
                    fields[k]['num_values'] += 1
                    if parent_col:
                        fields[k]['parent_col'] = parent_col
            ## FOR (list)

        ## ----------------------------------------------
        ## SCALAR VALUES
        ## ----------------------------------------------
        else:
            try:
                size = catalog.getEstimatedSize(fields[k]['type'], v)
                self.total_field_ctr += 1
            # NOTE(review): bare except -- same caveat as above
            except:
                LOG.error("Failed to estimate size for field %s in collection %s\n%s",\
                          k, col_info['name'], pformat(fields[k]))
                self.err_field_ctr += 1
                LOG.info("Total fields so far [%s], error fields [%s]",
                         self.total_field_ctr, self.err_field_ctr)
                continue
            col_info['data_size'] += size
            fields[k]['size_histogram'].put(size)
            fields[k]['distinct_values'].add(v)
            fields[k]['num_values'] += 1
            if parent_col:
                fields[k]['parent_col'] = parent_col
def processDataFields(self, col_info, fields, doc):
    """Recursively traverse a single document and extract out the field information.

    col_info -- the collection's catalog entry; its "data_size" and
                "interesting" entries are updated in place.
    fields   -- the (possibly nested) per-field dictionary to populate in place.
    doc      -- the document (dict) to examine.

    Fix: the three size-estimation handlers previously used bare ``except:``
    clauses, which also swallow SystemExit and KeyboardInterrupt; they now
    catch ``Exception`` so the process remains interruptible.
    """
    if self.debug:
        LOG.debug("Extracting fields for document:\n%s" % pformat(doc))

    # Check if the current doc has parent_col, but this will only apply to its fields
    parent_col = doc.get("parent_col", None)

    for k, v in doc.iteritems():
        # Skip if this is the _id field
        if constants.SKIP_MONGODB_ID_FIELD and k == "_id":
            continue
        # Skip the internal functional-field marker as well
        if k == constants.FUNCTIONAL_FIELD:
            continue

        f_type = type(v)
        f_type_str = catalog.fieldTypeToString(f_type)

        if not k in fields:
            # This is only a subset of what we will compute for each field
            # See catalog.Collection for more information
            if self.debug:
                LOG.debug("Creating new field entry for '%s'" % k)
            fields[k] = catalog.Collection.fieldFactory(k, f_type_str)
        else:
            # Record the most recently observed type for this field.
            # (No uniformity assertion: the data is not guaranteed uniform.)
            fields[k]["type"] = f_type_str

        # We will store the distinct values for each field in a set
        # that is embedded in the field. We will delete it when
        # we call computeFieldStats()
        if not "distinct_values" in fields[k]:
            fields[k]["distinct_values"] = set()
        if not "num_values" in fields[k]:
            fields[k]["num_values"] = 0
        # Likewise, we will also store a histogram for the different sizes
        # of each field. We will use this later on to compute the weighted average
        if not "size_histogram" in fields[k]:
            fields[k]["size_histogram"] = Histogram()
        # Maintain a histogram of list lengths
        if not "list_len" in fields[k]:
            fields[k]["list_len"] = Histogram()

        # Any field that the workload's queries touch gets flagged as interesting
        if fields[k]["query_use_count"] > 0 and not k in col_info["interesting"]:
            col_info["interesting"].append(k)

        ## ----------------------------------------------
        ## NESTED FIELDS
        ## ----------------------------------------------
        if isinstance(v, dict):
            # Check for a special data field
            # (a single-key dict whose key carries the replaced-'$' prefix)
            if len(v) == 1 and v.keys()[0].startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
                v = v[v.keys()[0]]
                # HACK to handle lists (hopefully dict as well)from nested IN clauses...
                all_values = v if isinstance(v, list) else [v]
                for v in all_values:
                    if isinstance(v, dict):
                        # presumably a single-entry operator dict; take its value
                        v = v.values()[0]
                    fields[k]["type"] = catalog.fieldTypeToString(type(v))
                    try:
                        size = catalog.getEstimatedSize(fields[k]["type"], v)
                        self.total_field_ctr += 1
                    except Exception:
                        # Narrowed from a bare except so Ctrl-C still works
                        if self.debug:
                            LOG.error(
                                "Failed to estimate size for field '%s' in collection '%s'\n%s",
                                k, col_info["name"], pformat(fields[k]))
                        self.err_field_ctr += 1
                        LOG.info(
                            "Total fields so far [%s], error fields [%s]",
                            self.total_field_ctr, self.err_field_ctr)
                        continue
                    col_info["data_size"] += size
                    fields[k]["size_histogram"].put(size)
                    fields[k]["distinct_values"].add(v)
                    fields[k]["num_values"] += 1
                    if parent_col:
                        fields[k]["parent_col"] = parent_col
                ## FOR
            else:
                if self.debug:
                    LOG.debug("Extracting keys in nested field for '%s'" % k)
                if not "fields" in fields[k]:
                    fields[k]["fields"] = {}
                # Recurse into the embedded document
                self.processDataFields(col_info, fields[k]["fields"], doc[k])

        ## ----------------------------------------------
        ## LIST OF VALUES
        ## Could be either scalars or dicts. If it's a dict, then we'll just
        ## store the nested field information in the 'fields' value
        ## If it's a list, then we'll use a special marker 'LIST_INNER_FIELD' to
        ## store the field information for the inner values.
        ## ----------------------------------------------
        elif isinstance(v, list):
            if self.debug:
                LOG.debug("Extracting keys in nested list for '%s'" % k)
            if not "fields" in fields[k]:
                fields[k]["fields"] = {}
            list_len = len(doc[k])
            fields[k]["list_len"].put(list_len)
            for i in xrange(list_len):
                inner_type = type(doc[k][i])
                # More nested documents...
                if inner_type == dict:
                    if self.debug:
                        LOG.debug("Extracting keys in nested field in list position %d for '%s'" % (i, k))
                    self.processDataFields(col_info, fields[k]["fields"], doc[k][i])
                else:
                    # TODO: We probably should store a list of types here in case
                    # the list has different types of values
                    inner = fields[k]["fields"].get(constants.LIST_INNER_FIELD, {})
                    inner["type"] = catalog.fieldTypeToString(inner_type)
                    try:
                        inner_size = catalog.getEstimatedSize(inner["type"], doc[k][i])
                        self.total_field_ctr += 1
                    except Exception:
                        if self.debug:
                            LOG.error(
                                "Failed to estimate size for list entry #%d for field '%s' in collection '%s'\n%s",
                                i, k, col_info["name"], pformat(fields[k]))
                        self.err_field_ctr += 1
                        LOG.info(
                            "Total fields so far [%s], error fields [%s]",
                            self.total_field_ctr, self.err_field_ctr)
                        continue
                    fields[k]["fields"][constants.LIST_INNER_FIELD] = inner
                    fields[k]["size_histogram"].put(inner_size)
                    fields[k]["distinct_values"].add(doc[k][i])
                    fields[k]["num_values"] += 1
                    if parent_col:
                        fields[k]["parent_col"] = parent_col
            ## FOR (list)

        ## ----------------------------------------------
        ## SCALAR VALUES
        ## ----------------------------------------------
        else:
            try:
                size = catalog.getEstimatedSize(fields[k]["type"], v)
                self.total_field_ctr += 1
            except Exception:
                LOG.error(
                    "Failed to estimate size for field %s in collection %s\n%s",
                    k, col_info["name"], pformat(fields[k]))
                self.err_field_ctr += 1
                LOG.info("Total fields so far [%s], error fields [%s]",
                         self.total_field_ctr, self.err_field_ctr)
                continue
            col_info["data_size"] += size
            fields[k]["size_histogram"].put(size)
            fields[k]["distinct_values"].add(v)
            fields[k]["num_values"] += 1
            if parent_col:
                fields[k]["parent_col"] = parent_col
    ## FOR (doc fields)