def assert_valid_state_migrations(self, content_store: ContentStore, migration_batch: int = 50):
    """Assert that the states' ``migrate_q`` queries partition the content store.

    Verifies two invariants before running a migration:
      1. coverage — every document matches at least one state's ``migrate_q``
         (the per-state counts sum to at least the total count), and
      2. uniqueness — no document matches more than one state's ``migrate_q``
         (each document id is claimed by exactly one state).

    :param content_store: store whose documents are checked; only its
        ``count`` and ``query`` methods are used.
    :param migration_batch: page size used when scanning each state's
        matching documents.
    :raises RuntimeError: if some documents would miss migration, or if a
        document would be assigned two different states.
    """
    if self._if_all_items_have_state(content_store):
        print(f"Pipeline {self.name} has migrated state for all items")
        return
    # Cheap coverage check first: if the per-state counts do not add up to
    # the total document count, some documents match no state at all.
    # (Overlaps can only inflate the sum, so a shortfall is conclusive.)
    sum_migrated_count = sum(content_store.count(state.migrate_q) for state in self.states)
    total_count = content_store.count({})
    if sum_migrated_count < total_count:
        print(
            f"Sampling 100 object IDs that will miss migration "
            f"{self._object_ids_missing_migration(content_store)[: 100]}"
        )
        raise RuntimeError(
            f"Sum of all state docs does not match total count. "
            f"sum={sum_migrated_count} < actual={total_count}"
        )
    # Uniqueness check: page through every state's matching documents and
    # record which state claims each id.  Combined with the coverage check
    # above, this proves every document belongs to one and only one state.
    doc_id_to_state = {}  # type: Dict[str, PipelineState]

    def claim(doc, state):
        # Record that `state` claims `doc`; fail if another state already did.
        doc_id = doc["_id"]
        if doc_id in doc_id_to_state:
            raise RuntimeError(
                f"Document with id {doc_id} is going to have "
                f"both {doc_id_to_state[doc_id].name} and {state.name} state"
            )
        doc_id_to_state[doc_id] = state

    for state in self.states:
        state_name = state.name
        migrate_q = state.migrate_q
        migration_batches = content_store.count(migrate_q) // migration_batch
        for i in range(migration_batches):
            print(
                f"Checking pipeline {self.name} state {state_name} "
                f"migration batch {i + 1}/{migration_batches}"
            )
            for doc in content_store.query(migrate_q, skip=i * migration_batch, limit=migration_batch):
                claim(doc, state)
        # Remainder of documents that did not fill a whole batch.
        print(
            f"Checking pipeline {self.name} state {state_name} last migration batch"
        )
        for doc in content_store.query(migrate_q, skip=migration_batches * migration_batch):
            claim(doc, state)
def query(
    content_store: ContentStore,
    query_params: Dict,
    projection: List[str],
    limit: int,
    additional_q: Optional[Dict] = None,
) -> Dict:
    """Fetch one page of documents, newest first, with cursor pagination.

    ``query_params`` may carry a ``"from"`` object id (page strictly older
    than that id) or a ``"to"`` object id (page strictly newer than that id);
    ``"from"`` wins when both are present.  With neither, the newest page is
    returned.  ``additional_q`` is ANDed into every store query.

    Returns a dict with ``results`` (each run through ``add_created_at``),
    ``has_prev``/``has_next`` flags, and — when the page is non-empty —
    ``prev_to``/``next_from`` cursor ids as strings.
    """
    base_q = {} if additional_q is None else additional_q

    if "from" in query_params:
        # Page of documents strictly older than the cursor, newest first.
        anchor = bson.ObjectId(query_params["from"])
        results = content_store.query(
            q=dict(base_q, _id={"$lt": anchor}),
            projection=projection,
            limit=limit,
            sort={"_id": -1},
        )
    elif "to" in query_params:
        # Page of documents strictly newer than the cursor: query ascending
        # so `limit` takes the ones adjacent to the cursor, then flip back
        # to the newest-first presentation order.
        anchor = bson.ObjectId(query_params["to"])
        ascending = content_store.query(
            q=dict(base_q, _id={"$gt": anchor}),
            projection=projection,
            limit=limit,
            sort={"_id": 1},
        )
        results = list(reversed(ascending))
    else:
        # No cursor: newest page overall.
        results = content_store.query(
            q=base_q, projection=projection, limit=limit, sort={"_id": -1}
        )

    if not results:
        # Empty page carries no cursor ids.
        return {
            "has_prev": False,
            "has_next": False,
            "results": list(map(add_created_at, results)),
        }

    # Anything older than the last (oldest) doc on this page?
    next_from_id = bson.ObjectId(results[-1]["_id"])
    has_next = content_store.count(dict(base_q, _id={"$lt": next_from_id})) != 0

    # Anything newer than the first (newest) doc on this page?
    prev_to_id = bson.ObjectId(results[0]["_id"])
    has_prev = content_store.count(dict(base_q, _id={"$gt": prev_to_id})) != 0

    return {
        "has_prev": has_prev,
        "prev_to": str(prev_to_id),
        "has_next": has_next,
        "next_from": str(next_from_id),
        "results": list(map(add_created_at, results)),
    }