def processOpFields(self, fields, op, content):
    """Update field usage statistics and predicate classifications from
    one operation's query document.

    fields  -- the collection's field metadata dict (mutated in place)
    op      -- the workload operation record (its 'predicates' is mutated)
    content -- one query document from the operation
    """
    if self.debug:
        LOG.debug("Processing operation fields\n%s", pformat(content))
    for k, v in content.iteritems():
        # Skip anything that starts with our special char
        # Those are flag markers used by MongoDB's queries
        if k.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
            continue

        # We need to add the field to the collection if it doesn't
        # already exist. This will occur if this op was an aggregate,
        # which we ignore when recreating the schema
        if k not in fields:
            fields[k] = catalog.Collection.fieldFactory(
                k, catalog.fieldTypeToString(type(v)))
        fields[k]['query_use_count'] += 1

        # No predicate for insert operations
        # No projections for insert operations
        if op['type'] != constants.OP_TYPE_INSERT:
            # Update how this key was used with predicates
            if workload.isOpRegex(op, field=k):
                op['predicates'][k] = constants.PRED_TYPE_REGEX
            elif isinstance(v, dict):
                # A dict value here is an operator document (e.g. $gt/$lt)
                op['predicates'][k] = constants.PRED_TYPE_RANGE
            elif k not in op['predicates']:
                op['predicates'][k] = constants.PRED_TYPE_EQUALITY
    ## FOR
    ## TODO: Should we expect there to be field names with dot notation here?
    ##       Or should have all been cleaned out by the converters?
    return
def processOpFields(self, fields, op, content):
    """Record per-field query usage and predicate type information for a
    single operation's query document."""
    if self.debug:
        LOG.debug("Processing operation fields\n%s", pformat(content))
    is_insert = (op['type'] == constants.OP_TYPE_INSERT)
    for field_name, field_value in content.iteritems():
        # Keys beginning with our special prefix are MongoDB query flag
        # markers, not real document fields -- ignore them
        if field_name.startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
            continue
        # Lazily register the field in the collection metadata; it can be
        # absent if this op was an aggregate, which is skipped when the
        # schema is recreated
        f_type = type(field_value)
        if not field_name in fields:
            type_str = catalog.fieldTypeToString(f_type)
            fields[field_name] = catalog.Collection.fieldFactory(field_name, type_str)
        fields[field_name]['query_use_count'] += 1
        # Inserts carry neither predicates nor projections
        if not is_insert:
            # Classify how this key was used as a predicate
            if workload.isOpRegex(op, field=field_name):
                op['predicates'][field_name] = constants.PRED_TYPE_REGEX
            elif isinstance(field_value, dict):
                op['predicates'][field_name] = constants.PRED_TYPE_RANGE
            elif not field_name in op['predicates']:
                op['predicates'][field_name] = constants.PRED_TYPE_EQUALITY
    ## FOR
    ## TODO: Should we expect there to be field names with dot notation here?
    ## Or should have all been cleaned out by the converters?
    return
def extractSchema(self):
    """Reverse-engineer the collection metadata from the MySQL database.

    Reads table and column definitions from information_schema, builds one
    metadata Collection document per table (saved at the end of each
    iteration), and optionally extracts the table's data afterwards.
    """
    c1 = self.mysql_conn.cursor()
    c1.execute("""
        SELECT TABLE_NAME
          FROM information_schema.TABLES
         WHERE TABLE_SCHEMA = %s AND TABLE_NAME != %s""",
        (self.dbName, MYSQL_LOG_TABLE_NAME))

    # Maps table name -> ordered list of its column names
    tbl_cols = {}

    LOG.info("Extracting table information from database '%s'", self.dbName)
    for row in c1:
        tbl_name = row[0]

        col_info = self.metadata_db.Collection()
        col_info['name'] = tbl_name
        tbl_cols[col_info['name']] = []
        if self.debug:
            LOG.debug("Created metadata object for collection '%s'", tbl_name)

        # Pull the column definitions for this table
        c2 = self.mysql_conn.cursor()
        c2.execute("""
            SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION
              FROM information_schema.COLUMNS
             WHERE TABLE_SCHEMA = %s AND TABLE_NAME=%s
        """, (self.dbName, tbl_name))
        for col_row in c2:
            col_name = col_row[0]
            # Map the SQL type to its Python equivalent for the catalog
            col_type = catalog.sqlTypeToPython(col_row[1])
            col_position = col_row[2]
            tbl_cols[col_info['name']].append(col_name)
            col_type_str = catalog.fieldTypeToString(col_type)
            col_info["fields"][col_name] = catalog.Collection.fieldFactory(col_name, col_type_str)
            col_info["fields"][col_name]['ordinal_position'] = int(col_position)
            if self.debug:
                LOG.info("Created column information for '%s.%s'", tbl_name, col_name)
        ## FOR
        c2.close()

        # Get the index information from MySQL for this table
        # NOTE(review): identifiers cannot be parameterized in MySQL, so this
        # statement is string-built. dbName/tbl_name come from
        # information_schema here, but confirm they can never carry
        # untrusted input.
        sql = "SHOW INDEXES FROM " + self.dbName + "." + tbl_name
        c3 = self.mysql_conn.cursor()
        c3.execute(sql)
        index_name = None
        LOG.info("Extracting index information from table '%s'", tbl_name)
        # FIXME
        #for ind_row in c3:
            #if index_name <> ind_row[2]:
                #print pformat(ind_row)
                #col_info['indexes'][ind_row[2]] = []
                #index_name = ind_row[2]
            #col_info['indexes'][ind_row[2]].append(ind_row[4])
        ## FOR
        # BUGFIX: c3 was never closed, leaking a cursor per table
        c3.close()

        col_info.save()

        ## -----------------------------------------------------------
        ## EXTRACT DATA
        ## -----------------------------------------------------------
        if not self.no_mysql_dataset:
            self.extractData(tbl_name, tbl_cols[tbl_name])
    ## FOR
    # BUGFIX: c1 was never closed
    c1.close()
def extractSchema(self):
    """Reverse-engineer the collection metadata from the MySQL database.

    Reads table and column definitions from information_schema, builds one
    metadata Collection document per table (saved at the end of each
    iteration), and optionally extracts the table's data afterwards.
    """
    c1 = self.mysql_conn.cursor()
    c1.execute("""
        SELECT TABLE_NAME
          FROM information_schema.TABLES
         WHERE TABLE_SCHEMA = %s AND TABLE_NAME != %s""", \
        (self.dbName, MYSQL_LOG_TABLE_NAME))

    # Maps table name -> ordered list of its column names
    tbl_cols = {}

    LOG.info("Extracting table information from database '%s'", self.dbName)
    for row in c1:
        tbl_name = row[0]

        col_info = self.metadata_db.Collection()
        col_info['name'] = tbl_name
        tbl_cols[col_info['name']] = []
        if self.debug:
            LOG.debug("Created metadata object for collection '%s'", tbl_name)

        # Pull the column definitions for this table
        c2 = self.mysql_conn.cursor()
        c2.execute("""
            SELECT COLUMN_NAME, DATA_TYPE, ORDINAL_POSITION
              FROM information_schema.COLUMNS
             WHERE TABLE_SCHEMA = %s AND TABLE_NAME=%s
        """, (self.dbName, tbl_name))
        for col_row in c2:
            col_name = col_row[0]
            # Map the SQL type to its Python equivalent for the catalog
            col_type = catalog.sqlTypeToPython(col_row[1])
            col_position = col_row[2]
            tbl_cols[col_info['name']].append(col_name)
            col_type_str = catalog.fieldTypeToString(col_type)
            col_info["fields"][col_name] = catalog.Collection.fieldFactory(col_name, col_type_str)
            col_info["fields"][col_name]['ordinal_position'] = int(col_position)
            if self.debug:
                LOG.info("Created column information for '%s.%s'", tbl_name, col_name)
        ## FOR
        c2.close()

        # Get the index information from MySQL for this table
        # NOTE(review): SQL built by string concatenation (identifiers cannot
        # be parameterized in MySQL); dbName/tbl_name come from
        # information_schema here -- confirm they can never carry untrusted
        # input. Also note this cursor is never closed.
        sql = "SHOW INDEXES FROM " + self.dbName + "." + tbl_name
        c3 = self.mysql_conn.cursor()
        c3.execute(sql)
        index_name = None
        LOG.info("Extracting index information from table '%s'", tbl_name)
        # FIXME
        #for ind_row in c3:
            #if index_name <> ind_row[2]:
                #print pformat(ind_row)
                #col_info['indexes'][ind_row[2]] = []
                #index_name = ind_row[2]
            #col_info['indexes'][ind_row[2]].append(ind_row[4])
        ## FOR

        col_info.save()

        ## -----------------------------------------------------------
        ## EXTRACT DATA
        ## -----------------------------------------------------------
        if not self.no_mysql_dataset:
            self.extractData(tbl_name, tbl_cols[tbl_name])
def testFieldTypeSerialization(self):
    """Verify that Python types round-trip through the catalog's
    serialized (BSON string) representation."""
    for t in [int, str, unicode, float]:
        t_bson = catalog.fieldTypeToString(t)
        # was: assertFalse(t_bson == None) -- assertIsNotNone gives a
        # clearer failure message and tests identity, not equality
        self.assertIsNotNone(t_bson)
        t_python = catalog.fieldTypeToPython(t_bson)
        self.assertIsNotNone(t_python)
        # assertEquals is a deprecated alias of assertEqual
        self.assertEqual(t, t_python)
    ## FOR
def testFieldTypeSerialization(self):
    """Check that each supported Python type survives a round trip through
    the catalog's string type-name format."""
    types_to_check = (int, str, unicode, float)
    for py_type in types_to_check:
        serialized = catalog.fieldTypeToString(py_type)
        self.assertFalse(serialized == None)
        deserialized = catalog.fieldTypeToPython(serialized)
        self.assertFalse(deserialized == None)
        self.assertEquals(py_type, deserialized)
    ## FOR
def setUp(self): # Create a fake Collection catalog entry # WORKLOAD self.col_info = catalog.Collection() self.col_info['name'] = COLLECTION_NAME self.col_info['doc_count'] = NUM_DOCUMENTS self.col_info['workload_queries'] = 1000 self.col_info['workload_percent'] = 1.0 for f in xrange(NUM_FIELDS + 1): # We always need the _id field if not f: f_name = "_id" f_type = catalog.fieldTypeToString(int) f_size = catalog.getEstimatedSize(f_type, 10000) else: f_name = "field%02d" % f if f % 2 == 0: f_type = catalog.fieldTypeToString(long) f_size = catalog.getEstimatedSize(f_type, 10000000l) else: f_type = catalog.fieldTypeToString(str) f_size = 128 f = catalog.Collection.fieldFactory(f_name, f_type) f['avg_size'] = f_size f['query_use_count'] = self.col_info['workload_queries'] self.col_info['fields'][f_name] = f self.col_info['interesting'].append(f_name) self.col_info['avg_doc_size'] += f_size ## FOR (field) self.design = Design() self.design.addCollection(self.col_info['name']) self.design.addIndex(self.col_info['name'], ["_id"]) self.design.addIndex(self.col_info['name'], self.col_info['interesting'][1:3]) self.buffer = LRUBuffer({self.col_info['name']: self.col_info}, BUFFER_SIZE) self.buffer.initialize(self.design)
def setUp(self): # Create a fake Collection catalog entry # WORKLOAD self.col_info = catalog.Collection() self.col_info['name'] = COLLECTION_NAME self.col_info['doc_count'] = NUM_DOCUMENTS self.col_info['workload_queries'] = 1000 self.col_info['workload_percent'] = 1.0 for f in xrange(NUM_FIELDS+1): # We always need the _id field if not f: f_name = "_id" f_type = catalog.fieldTypeToString(int) f_size = catalog.getEstimatedSize(f_type, 10000) else: f_name = "field%02d" % f if f % 2 == 0: f_type = catalog.fieldTypeToString(long) f_size = catalog.getEstimatedSize(f_type, 10000000l) else: f_type = catalog.fieldTypeToString(str) f_size = 128 f = catalog.Collection.fieldFactory(f_name, f_type) f['avg_size'] = f_size f['query_use_count'] = self.col_info['workload_queries'] self.col_info['fields'][f_name] = f self.col_info['interesting'].append(f_name) self.col_info['avg_doc_size'] += f_size ## FOR (field) self.design = Design() self.design.addCollection(self.col_info['name']) self.design.addIndex(self.col_info['name'], ["_id"]) self.design.addIndex(self.col_info['name'], self.col_info['interesting'][1:3]) self.buffer = LRUBuffer({self.col_info['name']: self.col_info}, BUFFER_SIZE) self.buffer.initialize(self.design)
def processDataFields(self, col_info, fields, doc):
    """
        Recursively traverse a single document and extract out the field information

        col_info -- the collection's metadata entry (data_size/interesting mutated)
        fields   -- the field metadata dict being populated (mutated in place)
        doc      -- the document (or nested sub-document) to examine
    """
    if self.debug:
        LOG.debug("Extracting fields for document:\n%s" % pformat(doc))

    # Check if the current doc has parent_col, but this will only apply to its fields
    parent_col = doc.get('parent_col', None)

    for k, v in doc.iteritems():
        # Skip if this is the _id field
        if constants.SKIP_MONGODB_ID_FIELD and k == '_id': continue
        if k == constants.FUNCTIONAL_FIELD: continue

        f_type = type(v)
        f_type_str = catalog.fieldTypeToString(f_type)

        if not k in fields:
            # This is only subset of what we will compute for each field
            # See catalog.Collection for more information
            if self.debug: LOG.debug("Creating new field entry for '%s'" % k)
            fields[k] = catalog.Collection.fieldFactory(k, f_type_str)
        else:
            fields[k]['type'] = f_type_str
            # Sanity check
            # This won't work if the data is not uniform
            #if v != None:
                #assert fields[k]['type'] == f_type_str, \
                #"Mismatched field types '%s' <> '%s' for '%s'" % (fields[k]['type'], f_type_str, k)

        # We will store the distinct values for each field in a set
        # that is embedded in the field. We will delete it when
        # we call computeFieldStats()
        if not 'distinct_values' in fields[k]:
            fields[k]['distinct_values'] = set()
        if not "num_values" in fields[k]:
            fields[k]['num_values'] = 0
        # Likewise, we will also store a histogram for the different sizes
        # of each field. We will use this later on to compute the weighted average
        if not 'size_histogram' in fields[k]:
            fields[k]['size_histogram'] = Histogram()
        # Maintain a histogram of list lengths
        if not 'list_len' in fields[k]:
            fields[k]['list_len'] = Histogram()

        # A field that queries actually touch is "interesting" to the designer
        if fields[k]['query_use_count'] > 0 and not k in col_info['interesting']:
            col_info['interesting'].append(k)

        ## ----------------------------------------------
        ## NESTED FIELDS
        ## ----------------------------------------------
        if isinstance(v, dict):
            # Check for a special data field
            if len(v) == 1 and v.keys()[0].startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
                v = v[v.keys()[0]]
                # HACK to handle lists (hopefully dict as well)from nested IN clauses...
                all_values = v if isinstance(v, list) else [v]
                for v in all_values:
                    if isinstance(v, dict):
                        v = v.values()[0]
                    fields[k]['type'] = catalog.fieldTypeToString(type(v))
                    try:
                        size = catalog.getEstimatedSize(fields[k]['type'], v)
                        self.total_field_ctr += 1
                    # BUGFIX: was a bare 'except:', which would also swallow
                    # KeyboardInterrupt/SystemExit
                    except Exception:
                        if self.debug:
                            LOG.error("Failed to estimate size for field '%s' in collection '%s'\n%s", \
                                      k, col_info['name'], pformat(fields[k]))
                        self.err_field_ctr += 1
                        LOG.info("Total fields so far [%s], error fields [%s]",
                                 self.total_field_ctr, self.err_field_ctr)
                        continue
                    col_info['data_size'] += size
                    fields[k]['size_histogram'].put(size)
                    fields[k]['distinct_values'].add(v)
                    fields[k]['num_values'] += 1
                    if parent_col:
                        fields[k]['parent_col'] = parent_col
                ## FOR
            else:
                if self.debug: LOG.debug("Extracting keys in nested field for '%s'" % k)
                if not 'fields' in fields[k]: fields[k]['fields'] = {}
                self.processDataFields(col_info, fields[k]['fields'], doc[k])

        ## ----------------------------------------------
        ## LIST OF VALUES
        ## Could be either scalars or dicts. If it's a dict, then we'll just
        ## store the nested field information in the 'fields' value
        ## If it's a list, then we'll use a special marker 'LIST_INNER_FIELD' to
        ## store the field information for the inner values.
        ## ----------------------------------------------
        elif isinstance(v, list):
            if self.debug: LOG.debug("Extracting keys in nested list for '%s'" % k)
            if not 'fields' in fields[k]: fields[k]['fields'] = {}
            list_len = len(doc[k])
            fields[k]['list_len'].put(list_len)
            for i in xrange(list_len):
                inner_type = type(doc[k][i])
                # More nested documents...
                if inner_type == dict:
                    if self.debug: LOG.debug("Extracting keys in nested field in list position %d for '%s'" % (i, k))
                    self.processDataFields(col_info, fields[k]['fields'], doc[k][i])
                else:
                    # TODO: We probably should store a list of types here in case
                    #       the list has different types of values
                    inner = fields[k]['fields'].get(constants.LIST_INNER_FIELD, {})
                    inner['type'] = catalog.fieldTypeToString(inner_type)
                    try:
                        inner_size = catalog.getEstimatedSize(inner['type'], doc[k][i])
                        self.total_field_ctr += 1
                    # BUGFIX: narrowed from a bare 'except:'
                    except Exception:
                        if self.debug:
                            LOG.error("Failed to estimate size for list entry #%d for field '%s' in collection '%s'\n%s",\
                                      i, k, col_info['name'], pformat(fields[k]))
                        self.err_field_ctr += 1
                        LOG.info("Total fields so far [%s], error fields [%s]",
                                 self.total_field_ctr, self.err_field_ctr)
                        continue
                    fields[k]['fields'][constants.LIST_INNER_FIELD] = inner
                    fields[k]['size_histogram'].put(inner_size)
                    fields[k]['distinct_values'].add(doc[k][i])
                    fields[k]['num_values'] += 1
                    if parent_col:
                        fields[k]['parent_col'] = parent_col
            ## FOR (list)
        ## ----------------------------------------------
        ## SCALAR VALUES
        ## ----------------------------------------------
        else:
            try:
                size = catalog.getEstimatedSize(fields[k]['type'], v)
                self.total_field_ctr += 1
            # BUGFIX: narrowed from a bare 'except:'
            except Exception:
                LOG.error("Failed to estimate size for field %s in collection %s\n%s",\
                          k, col_info['name'], pformat(fields[k]))
                self.err_field_ctr += 1
                LOG.info("Total fields so far [%s], error fields [%s]",
                         self.total_field_ctr, self.err_field_ctr)
                continue
            col_info['data_size'] += size
            fields[k]['size_histogram'].put(size)
            fields[k]['distinct_values'].add(v)
            fields[k]['num_values'] += 1
            if parent_col:
                fields[k]['parent_col'] = parent_col
    ## FOR
def processDataFields(self, col_info, fields, doc):
    """
        Recursively traverse a single document and extract out the field information

        col_info -- the collection's metadata entry (data_size/interesting mutated)
        fields   -- the field metadata dict being populated (mutated in place)
        doc      -- the document (or nested sub-document) to examine
    """
    if self.debug:
        LOG.debug("Extracting fields for document:\n%s" % pformat(doc))

    # Check if the current doc has parent_col, but this will only apply to its fields
    parent_col = doc.get("parent_col", None)

    for k, v in doc.iteritems():
        # Skip if this is the _id field
        if constants.SKIP_MONGODB_ID_FIELD and k == "_id": continue
        if k == constants.FUNCTIONAL_FIELD: continue

        f_type = type(v)
        f_type_str = catalog.fieldTypeToString(f_type)

        if not k in fields:
            # This is only subset of what we will compute for each field
            # See catalog.Collection for more information
            if self.debug: LOG.debug("Creating new field entry for '%s'" % k)
            fields[k] = catalog.Collection.fieldFactory(k, f_type_str)
        else:
            fields[k]["type"] = f_type_str
            # Sanity check
            # This won't work if the data is not uniform
            # if v != None:
                # assert fields[k]['type'] == f_type_str, \
                # "Mismatched field types '%s' <> '%s' for '%s'" % (fields[k]['type'], f_type_str, k)

        # We will store the distinct values for each field in a set
        # that is embedded in the field. We will delete it when
        # we call computeFieldStats()
        if not "distinct_values" in fields[k]:
            fields[k]["distinct_values"] = set()
        if not "num_values" in fields[k]:
            fields[k]["num_values"] = 0
        # Likewise, we will also store a histogram for the different sizes
        # of each field. We will use this later on to compute the weighted average
        if not "size_histogram" in fields[k]:
            fields[k]["size_histogram"] = Histogram()
        # Maintain a histogram of list lengths
        if not "list_len" in fields[k]:
            fields[k]["list_len"] = Histogram()

        # Fields that queries actually touch are "interesting" to the designer
        if fields[k]["query_use_count"] > 0 and not k in col_info["interesting"]:
            col_info["interesting"].append(k)

        ## ----------------------------------------------
        ## NESTED FIELDS
        ## ----------------------------------------------
        if isinstance(v, dict):
            # Check for a special data field
            if len(v) == 1 and v.keys()[0].startswith(constants.REPLACE_KEY_DOLLAR_PREFIX):
                v = v[v.keys()[0]]
                # HACK to handle lists (hopefully dict as well)from nested IN clauses...
                all_values = v if isinstance(v, list) else [v]
                for v in all_values:
                    if isinstance(v, dict):
                        v = v.values()[0]
                    fields[k]["type"] = catalog.fieldTypeToString(type(v))
                    try:
                        size = catalog.getEstimatedSize(fields[k]["type"], v)
                        self.total_field_ctr += 1
                    # NOTE(review): bare except also swallows
                    # KeyboardInterrupt/SystemExit -- consider narrowing
                    except:
                        if self.debug:
                            LOG.error(
                                "Failed to estimate size for field '%s' in collection '%s'\n%s",
                                k, col_info["name"], pformat(fields[k]),
                            )
                        self.err_field_ctr += 1
                        LOG.info(
                            "Total fields so far [%s], error fields [%s]",
                            self.total_field_ctr, self.err_field_ctr
                        )
                        continue
                    col_info["data_size"] += size
                    fields[k]["size_histogram"].put(size)
                    fields[k]["distinct_values"].add(v)
                    fields[k]["num_values"] += 1
                    if parent_col:
                        fields[k]["parent_col"] = parent_col
                ## FOR
            else:
                if self.debug: LOG.debug("Extracting keys in nested field for '%s'" % k)
                if not "fields" in fields[k]: fields[k]["fields"] = {}
                self.processDataFields(col_info, fields[k]["fields"], doc[k])

        ## ----------------------------------------------
        ## LIST OF VALUES
        ## Could be either scalars or dicts. If it's a dict, then we'll just
        ## store the nested field information in the 'fields' value
        ## If it's a list, then we'll use a special marker 'LIST_INNER_FIELD' to
        ## store the field information for the inner values.
        ## ----------------------------------------------
        elif isinstance(v, list):
            if self.debug: LOG.debug("Extracting keys in nested list for '%s'" % k)
            if not "fields" in fields[k]: fields[k]["fields"] = {}
            list_len = len(doc[k])
            fields[k]["list_len"].put(list_len)
            for i in xrange(list_len):
                inner_type = type(doc[k][i])
                # More nested documents...
                if inner_type == dict:
                    if self.debug: LOG.debug("Extracting keys in nested field in list position %d for '%s'" % (i, k))
                    self.processDataFields(col_info, fields[k]["fields"], doc[k][i])
                else:
                    # TODO: We probably should store a list of types here in case
                    # the list has different types of values
                    inner = fields[k]["fields"].get(constants.LIST_INNER_FIELD, {})
                    inner["type"] = catalog.fieldTypeToString(inner_type)
                    try:
                        inner_size = catalog.getEstimatedSize(inner["type"], doc[k][i])
                        self.total_field_ctr += 1
                    # NOTE(review): bare except -- see note above in this method
                    except:
                        if self.debug:
                            LOG.error(
                                "Failed to estimate size for list entry #%d for field '%s' in collection '%s'\n%s",
                                i, k, col_info["name"], pformat(fields[k]),
                            )
                        self.err_field_ctr += 1
                        LOG.info(
                            "Total fields so far [%s], error fields [%s]",
                            self.total_field_ctr, self.err_field_ctr
                        )
                        continue
                    fields[k]["fields"][constants.LIST_INNER_FIELD] = inner
                    fields[k]["size_histogram"].put(inner_size)
                    fields[k]["distinct_values"].add(doc[k][i])
                    fields[k]["num_values"] += 1
                    if parent_col:
                        fields[k]["parent_col"] = parent_col
            ## FOR (list)
        ## ----------------------------------------------
        ## SCALAR VALUES
        ## ----------------------------------------------
        else:
            try:
                size = catalog.getEstimatedSize(fields[k]["type"], v)
                self.total_field_ctr += 1
            # NOTE(review): bare except -- see note above in this method
            except:
                LOG.error(
                    "Failed to estimate size for field %s in collection %s\n%s",
                    k, col_info["name"], pformat(fields[k]),
                )
                self.err_field_ctr += 1
                LOG.info("Total fields so far [%s], error fields [%s]", self.total_field_ctr, self.err_field_ctr)
                continue
            col_info["data_size"] += size
            fields[k]["size_histogram"].put(size)
            fields[k]["distinct_values"].add(v)
            fields[k]["num_values"] += 1
            if parent_col:
                fields[k]["parent_col"] = parent_col
    ## FOR