def fixInvalidCollections(self):
    """Repair operations whose collection was marked invalid.

    Scans every Session whose operations contain the INVALID_COLLECTION_MARKER
    and tries to recover the real collection name by matching the operation's
    referenced fields against the fields of every known collection. The fix is
    applied only when exactly one collection is the unique best match; zero or
    multiple matches are logged and skipped. Increments self.fix_ctr per fix.
    """
    searchKey = {"operations.collection": constants.INVALID_COLLECTION_MARKER}
    for session in self.metadata_db.Session.find(searchKey):
        # BUG FIX: 'dirty' must persist across all operations of the session.
        # It was previously reset at the top of the op loop, so the session
        # was saved only if the *last* operation happened to be fixed.
        dirty = False
        for op in session["operations"]:
            if op["collection"] != constants.INVALID_COLLECTION_MARKER:
                continue
            if self.debug:
                LOG.debug("Attempting to fix corrupted Operation:\n%s" % pformat(op))

            # For each field referenced in the query, build a histogram of
            # which collections have a field with the same name
            fields = workload.getReferencedFields(op)
            h = Histogram()
            for c in self.metadata_db.Collection.find():
                for f in c['fields']:
                    if f in fields:
                        h.put(c['name'])
                ## FOR
            ## FOR

            matches = h.getMaxCountKeys()
            if len(matches) == 0:
                LOG.warn("No matching collection was found for corrupted operation\n%s" % pformat(op))
                continue
            elif len(matches) > 1:
                LOG.warn("More than one matching collection was found for corrupted operation %s\n%s" % (matches, pformat(op)))
                continue
            else:
                op["collection"] = matches[0]
                dirty = True
                self.fix_ctr += 1
                LOG.info("Fix corrupted collection in operation\n%s" % pformat(op))
            ## IF
        ## FOR (operations)
        if dirty:
            session.save()
    ## FOR (sessions)
## DEF
def fixInvalidCollections(self):
    """Repair operations whose collection was marked invalid.

    For each Session containing an operation tagged with
    INVALID_COLLECTION_MARKER, guess the real collection by histogramming
    which collections share field names with the operation's referenced
    fields. Only an unambiguous (single) best match is applied; ambiguous or
    empty matches are logged and skipped. Increments self.fix_ctr per fix.

    NOTE(review): this definition duplicates an identical one earlier in the
    file; the later definition wins at class-creation time — confirm whether
    one of the two should be removed.
    """
    searchKey = {"operations.collection": constants.INVALID_COLLECTION_MARKER}
    for session in self.metadata_db.Session.find(searchKey):
        # BUG FIX: initialize the dirty flag once per session. Previously it
        # was reset inside the op loop, so session.save() ran only when the
        # final operation in the list was the one repaired.
        dirty = False
        for op in session["operations"]:
            if op["collection"] != constants.INVALID_COLLECTION_MARKER:
                continue
            if self.debug:
                LOG.debug("Attempting to fix corrupted Operation:\n%s" % pformat(op))

            # For each field referenced in the query, build a histogram of
            # which collections have a field with the same name
            fields = workload.getReferencedFields(op)
            h = Histogram()
            for c in self.metadata_db.Collection.find():
                for f in c["fields"]:
                    if f in fields:
                        h.put(c["name"])
                ## FOR
            ## FOR

            matches = h.getMaxCountKeys()
            if len(matches) == 0:
                LOG.warn("No matching collection was found for corrupted operation\n%s" % pformat(op))
                continue
            elif len(matches) > 1:
                LOG.warn("More than one matching collection was found for corrupted operation %s\n%s" % (matches, pformat(op)))
                continue
            else:
                op["collection"] = matches[0]
                dirty = True
                self.fix_ctr += 1
                LOG.info("Fix corrupted collection in operation\n%s" % pformat(op))
            ## IF
        ## FOR (operations)
        if dirty:
            session.save()
    ## FOR (sessions)
## DEF
def hash(self, op):
    """Compute a deterministic signature for the given operation based on its keys.

    Parameters:
        op: operation document (dict) with at least 'type', 'collection',
            'query_id', and 'query_content' entries.
    Returns:
        A long integer signature, which is also recorded in self.histogram.
    Raises:
        Exception: if a QUERY operation is missing its where-clause key, or
            if op['type'] is not one of the known operation types.
    """
    fields = None
    updateFields = None

    # The sanitized form of the '$query' key (presumably '#query' — the
    # constant replaces the '$' prefix; verify against constants module).
    queryKey = constants.REPLACE_KEY_DOLLAR_PREFIX + "query"

    # QUERY
    if op["type"] == constants.OP_TYPE_QUERY:
        # The query field has our where clause.
        # CONSISTENCY FIX: the presence check previously used the literal
        # "#query" while the access below used the constant-prefixed key;
        # both now derive from the same constant so they cannot diverge.
        if queryKey not in op["query_content"][0]:
            msg = "Missing query field in query_content for operation #%d" % op["query_id"]
            if self.debug:
                LOG.warn(pformat(op))
            raise Exception(msg)
        fields = op["query_content"][0][queryKey]

    # UPDATE
    elif op["type"] == constants.OP_TYPE_UPDATE:
        # The first element in the content field is the WHERE clause
        fields = op["query_content"][0]
        # We use a separate field for the updated columns so that
        # updates with identical WHERE clauses but different SET lists
        # still hash differently.
        updateFields = op['query_content'][1]

    # INSERT
    elif op["type"] == constants.OP_TYPE_INSERT:
        # They could be inserting more than one document here,
        # which all may have different fields...
        # So we will need to build a histogram for which keys are referenced
        # and use the ones that appear the most.
        # XXX: We'll only consider keys in the first-level
        h = Histogram()
        for doc in op["query_content"]:
            # isinstance is the proper type check (was: type(doc) == dict,
            # which rejects dict subclasses).
            assert isinstance(doc, dict), "Unexpected insert value:\n%s" % pformat(doc)
            for k in doc.keys():
                h.put(k)
        ## FOR
        if LOG.isEnabledFor(logging.DEBUG):
            LOG.debug("Insert '%s' Keys Histogram:\n%s" % (op["collection"], h))
        maxKeys = h.getMaxCountKeys()
        assert len(maxKeys) > 0, \
            "No keys were found in %d insert documents?" % len(op["query_content"])

        # Merge the most-frequent keys across all inserted documents;
        # later documents overwrite earlier values for the same key.
        fields = { }
        for doc in op["query_content"]:
            for k, v in doc.iteritems():
                if k in maxKeys:
                    fields[k] = v
            ## FOR
        ## FOR

    # DELETE
    elif op["type"] == constants.OP_TYPE_DELETE:
        # The first element in the content field is the WHERE clause
        fields = op["query_content"][0]

    # UNKNOWN!
    else:
        raise Exception("Unexpected query type: %s" % op["type"])

    # Extract the list of fields that are used
    try:
        fieldsHash = self.computeFieldsHash(fields)
    except Exception:
        # Narrowed from a bare 'except:' so KeyboardInterrupt/SystemExit are
        # not intercepted; the original exception is still re-raised.
        LOG.error("Unexpected error when processing operation %d [fields=%s]" % (op["query_id"], str(fields)))
        raise
    updateHash = self.computeFieldsHash(updateFields) if updateFields else None

    # The signature is collection + type + hashed WHERE (+ hashed SET).
    t = (op["collection"], op["type"], fieldsHash, updateHash)
    h = long(hash(t))
    LOG.debug("%s %s => HASH:%d" % (fields, t, h))
    self.histogram.put(h)
    return h
## DEF