def get_unique_metric_variants(cls, company_id, project_ids: Sequence[str], include_subprojects: bool):
    """
    Collect the distinct (metric, variant) pairs recorded in the
    ``last_metrics`` field of tasks within the given company/project scope.

    Each returned entry carries the metric/variant display names together
    with their hashed storage keys, sorted by metric then variant name.
    """
    company_match = cls._get_company_constraint(company_id)
    project_match = cls._get_project_constraint(project_ids, include_subprojects)
    pipeline = [
        # Restrict to the requested company and project scope
        {"$match": {**company_match, **project_match}},
        # Explode last_metrics into (metric_hash, variants) pairs
        {"$project": {"metrics": {"$objectToArray": "$last_metrics"}}},
        {"$unwind": "$metrics"},
        {
            "$project": {
                "metric": "$metrics.k",
                "variants": {"$objectToArray": "$metrics.v"},
            }
        },
        {"$unwind": "$variants"},
        # Deduplicate by the (metric, variant) display names
        {
            "$group": {
                "_id": {
                    "metric": "$variants.v.metric",
                    "variant": "$variants.v.variant",
                },
                "metrics": {
                    "$addToSet": {
                        "metric": "$variants.v.metric",
                        "metric_hash": "$metric",
                        "variant": "$variants.v.variant",
                        "variant_hash": "$variants.k",
                    }
                },
            }
        },
        {"$sort": OrderedDict({"_id.metric": 1, "_id.variant": 1})},
    ]
    # One representative entry per unique (metric, variant) group
    return [group["metrics"][0] for group in Task.aggregate(pipeline)]
def get_configuration_names(cls, company_id: str, task_ids: Sequence[str]) -> Dict[str, list]:
    """
    For each requested task, collect the names of its configuration
    sections (unescaped and sorted alphabetically).

    :return: mapping of task id -> {"names": [sorted section names]}
    """
    with TimingContext("mongo", "get_configuration_names"):
        aggregation = [
            # Tasks may be public (None/"" company) or owned by the company
            {
                "$match": {
                    "company": {"$in": [None, "", company_id]},
                    "_id": {"$in": task_ids},
                }
            },
            {"$project": {"items": {"$objectToArray": "$configuration"}}},
            {"$unwind": "$items"},
            {"$group": {"_id": "$_id", "names": {"$addToSet": "$items.k"}}},
        ]
        result = {}
        for doc in Task.aggregate(aggregation):
            # Stored keys are escaped; restore the user-facing names
            unescaped = sorted(
                ParameterKeyEscaper.unescape(name) for name in doc["names"]
            )
            result[doc["_id"]] = {"names": unescaped}
        return result
def _get_experiments_stats(cls, company_id, workers: Optional[Sequence] = None) -> dict:
    """
    Aggregate experiment statistics (count, truncated average run time in
    seconds, truncated average iterations) for tasks that actually ran
    (have both ``started`` and ``last_update``, and are past created/queued).

    When ``workers`` is provided, stats are grouped per worker and limited
    to those workers; otherwise a single aggregate entry keyed by ``None``
    is produced.
    """
    worker_filter = {"last_worker": {"$in": workers}} if workers else {}
    pipeline = [
        {
            "$match": {
                "company": company_id,
                "started": {"$exists": True, "$ne": None},
                "last_update": {"$exists": True, "$ne": None},
                "status": {"$nin": ["created", "queued"]},
                **worker_filter,
            }
        },
        {
            "$group": {
                "_id": "$last_worker" if workers else None,
                "count": {"$sum": 1},
                "avg_run_time_sec": {
                    # last_update/started deltas are in milliseconds
                    "$avg": {
                        "$divide": [
                            {"$subtract": ["$last_update", "$started"]},
                            1000,
                        ]
                    }
                },
                "avg_iterations": {"$avg": "$last_iteration"},
            }
        },
        {
            "$project": {
                "count": 1,
                "avg_run_time_sec": {"$trunc": "$avg_run_time_sec"},
                "avg_iterations": {"$trunc": "$avg_iterations"},
            }
        },
    ]
    stats = {}
    for entry in Task.aggregate(pipeline):
        stats[entry["_id"]] = {k: v for k, v in entry.items() if k != "_id"}
    return stats
def get_aggregated_project_parameters(
    cls,
    company_id,
    project_ids: Sequence[str],
    include_subprojects: bool,
    page: int = 0,
    page_size: int = 500,
) -> Tuple[int, int, Sequence[dict]]:
    """
    Return the paged list of distinct hyperparameter (section, name) pairs
    used by tasks in the given company/project scope.

    :param page: zero-based page index (negative values clamped to 0)
    :param page_size: page size (values below 1 clamped to 1)
    :return: (total, remaining, results) where ``total`` is the overall
        number of distinct pairs, ``remaining`` is how many pairs come
        after this page, and ``results`` is the page of
        ``{"section": ..., "name": ...}`` dicts (unescaped).
    """
    page = max(0, page)
    page_size = max(1, page_size)
    pipeline = [
        {
            "$match": {
                **cls._get_company_constraint(company_id),
                **cls._get_project_constraint(project_ids, include_subprojects),
                "hyperparams": {"$exists": True, "$gt": {}},
            }
        },
        {"$project": {"sections": {"$objectToArray": "$hyperparams"}}},
        {"$unwind": "$sections"},
        {
            "$project": {
                "section": "$sections.k",
                "names": {"$objectToArray": "$sections.v"},
            }
        },
        {"$unwind": "$names"},
        {"$group": {"_id": {"section": "$section", "name": "$names.k"}}},
        {"$sort": OrderedDict({"_id.section": 1, "_id.name": 1})},
        # BUG FIX: count the total BEFORE paging. Previously $skip/$limit ran
        # ahead of the counting $group, so "total" reflected only the current
        # page and "remaining" was always 0. Count everything first, then
        # slice the requested page out of the pushed results.
        {
            "$group": {
                "_id": 1,
                "total": {"$sum": 1},
                "results": {"$push": "$$ROOT"},
            }
        },
        {
            "$project": {
                "total": 1,
                "results": {"$slice": ["$results", page * page_size, page_size]},
            }
        },
    ]
    result = next(Task.aggregate(pipeline), None)
    total = 0
    remaining = 0
    results = []
    if result:
        total = int(result.get("total", -1))
        results = [{
            "section": ParameterKeyEscaper.unescape(nested_get(r, ("_id", "section"))),
            "name": ParameterKeyEscaper.unescape(nested_get(r, ("_id", "name"))),
        } for r in result.get("results", [])]
        # Pairs that come after the end of this page
        remaining = max(0, total - (len(results) + page * page_size))
    return total, remaining, results
def get_task_hyperparam_distinct_values(
    self,
    company_id: str,
    project_ids: Sequence[str],
    section: str,
    name: str,
    include_subprojects: bool,
    allow_public: bool = True,
) -> ParamValues:
    """
    Return ``(total, values)``: the distinct values of the given
    hyperparameter across the tasks in scope, capped by the configured
    maximum count.

    Results are cached in redis; the cache entry is considered valid as
    long as no matching task was updated more recently than the allowed
    outdate window.
    """
    company_scope = self._get_company_constraint(company_id, allow_public)
    project_scope = self._get_project_constraint(project_ids, include_subprojects)
    escaped_section = ParameterKeyEscaper.escape(section)
    escaped_name = ParameterKeyEscaper.escape(name)
    key_path = f"hyperparams.{escaped_section}.{escaped_name}"

    # Find the most recently updated task that has this hyperparam at all;
    # its last_update timestamp drives cache invalidation.
    query_kwargs = {
        **company_scope,
        **project_scope,
        f"{key_path.replace('.', '__')}__exists": True,
    }
    newest = (
        Task.objects(**query_kwargs)
        .only("last_update")
        .order_by("-last_update")
        .limit(1)
        .first()
    )
    if not newest:
        return 0, []

    redis_key = f"hyperparam_values_{company_id}_{'_'.join(project_ids)}_{section}_{name}_{allow_public}"
    last_update = newest.last_update or datetime.utcnow()
    allowed_delta = config.get(
        "services.tasks.hyperparam_values.cache_allowed_outdate_sec", 60
    )
    cached_res = self._get_cached_param_values(
        key=redis_key, last_update=last_update, allowed_delta_sec=allowed_delta
    )
    if cached_res:
        return cached_res

    max_values = config.get("services.tasks.hyperparam_values.max_count", 100)
    pipeline = [
        {
            "$match": {
                **company_scope,
                **project_scope,
                key_path: {"$exists": True},
            }
        },
        {"$project": {"value": f"${key_path}.value"}},
        # Distinct values, sorted with numeric-aware collation, capped
        {"$group": {"_id": "$value"}},
        {"$sort": {"_id": 1}},
        {"$limit": max_values},
        {
            "$group": {
                "_id": 1,
                "total": {"$sum": 1},
                "results": {"$push": "$$ROOT._id"},
            }
        },
    ]
    aggregated = next(
        Task.aggregate(pipeline, collation=Task._numeric_locale), None
    )
    if not aggregated:
        return 0, []

    total = int(aggregated.get("total", 0))
    values = aggregated.get("results", [])

    ttl = config.get("services.tasks.hyperparam_values.cache_ttl_sec", 86400)
    payload = dict(last_update=last_update.timestamp(), total=total, values=values)
    self.redis.setex(redis_key, ttl, json.dumps(payload))

    return total, values
def get_aggregated_project_parameters(
    company_id,
    project_ids: Sequence[str] = None,
    page: int = 0,
    page_size: int = 500,
) -> Tuple[int, int, Sequence[dict]]:
    """
    Return the paged list of distinct hyperparameter (section, name) pairs
    used by company (or public) tasks, optionally restricted to projects.

    :return: (total, remaining, results) where ``results`` holds the
        requested page of ``{"section": ..., "name": ...}`` dicts.
    """
    page = max(0, page)
    page_size = max(1, page_size)
    project_filter = {"project": {"$in": project_ids}} if project_ids else {}
    pipeline = [
        {
            "$match": {
                "company": {"$in": [None, "", company_id]},
                "hyperparams": {"$exists": True, "$gt": {}},
                **project_filter,
            }
        },
        # Flatten hyperparams into (section, name) pairs
        {"$project": {"sections": {"$objectToArray": "$hyperparams"}}},
        {"$unwind": "$sections"},
        {
            "$project": {
                "section": "$sections.k",
                "names": {"$objectToArray": "$sections.v"},
            }
        },
        {"$unwind": "$names"},
        {"$group": {"_id": {"section": "$section", "name": "$names.k"}}},
        {"$sort": OrderedDict({"_id.section": 1, "_id.name": 1})},
        # Count everything, then carve out the requested page
        {
            "$group": {
                "_id": 1,
                "total": {"$sum": 1},
                "results": {"$push": "$$ROOT"},
            }
        },
        {
            "$project": {
                "total": 1,
                "results": {"$slice": ["$results", page * page_size, page_size]},
            }
        },
    ]

    with translate_errors_context():
        aggregated = next(Task.aggregate(pipeline), None)

    if not aggregated:
        return 0, 0, []

    total = int(aggregated.get("total", -1))
    results = [
        {
            "section": ParameterKeyEscaper.unescape(dpath.get(r, "_id/section")),
            "name": ParameterKeyEscaper.unescape(dpath.get(r, "_id/name")),
        }
        for r in aggregated.get("results", [])
    ]
    remaining = max(0, total - (len(results) + page * page_size))
    return total, remaining, results
def get_unique_metric_variants(company_id, project_ids=None):
    """
    Collect the distinct (metric, variant) pairs from the ``last_metrics``
    of company (or public) tasks, optionally restricted to projects.

    Each entry carries the metric/variant display names together with
    their hashed storage keys, sorted by metric then variant name.
    """
    match_constraints = {"company": {"$in": [None, "", company_id]}}
    if project_ids:
        match_constraints["project"] = {"$in": project_ids}
    pipeline = [
        {"$match": match_constraints},
        # Explode last_metrics into (metric_hash, variants) pairs
        {"$project": {"metrics": {"$objectToArray": "$last_metrics"}}},
        {"$unwind": "$metrics"},
        {
            "$project": {
                "metric": "$metrics.k",
                "variants": {"$objectToArray": "$metrics.v"},
            }
        },
        {"$unwind": "$variants"},
        # Deduplicate by the (metric, variant) display names
        {
            "$group": {
                "_id": {
                    "metric": "$variants.v.metric",
                    "variant": "$variants.v.variant",
                },
                "metrics": {
                    "$addToSet": {
                        "metric": "$variants.v.metric",
                        "metric_hash": "$metric",
                        "variant": "$variants.v.variant",
                        "variant_hash": "$variants.k",
                    }
                },
            }
        },
        {"$sort": OrderedDict({"_id.metric": 1, "_id.variant": 1})},
    ]
    with translate_errors_context():
        groups = Task.aggregate(pipeline)
        return [g["metrics"][0] for g in groups]
def get_unique_metric_variants(company_id, project_ids: Sequence[str], include_subprojects: bool):
    """
    Collect the distinct (metric, variant) pairs from the ``last_metrics``
    of company (or public) tasks within the given projects, optionally
    extending the scope to their sub-projects.
    """
    project_constraint = {}
    if project_ids:
        scope_ids = (
            project_ids_with_children(project_ids)
            if include_subprojects
            else project_ids
        )
        project_constraint = {"project": {"$in": scope_ids}}
    pipeline = [
        {
            "$match": dict(
                company={"$in": [None, "", company_id]},
                **project_constraint,
            )
        },
        # Explode last_metrics into (metric_hash, variants) pairs
        {"$project": {"metrics": {"$objectToArray": "$last_metrics"}}},
        {"$unwind": "$metrics"},
        {
            "$project": {
                "metric": "$metrics.k",
                "variants": {"$objectToArray": "$metrics.v"},
            }
        },
        {"$unwind": "$variants"},
        # Deduplicate by the (metric, variant) display names
        {
            "$group": {
                "_id": {
                    "metric": "$variants.v.metric",
                    "variant": "$variants.v.variant",
                },
                "metrics": {
                    "$addToSet": {
                        "metric": "$variants.v.metric",
                        "metric_hash": "$metric",
                        "variant": "$variants.v.variant",
                        "variant_hash": "$variants.k",
                    }
                },
            }
        },
        {"$sort": OrderedDict({"_id.metric": 1, "_id.variant": 1})},
    ]
    with translate_errors_context():
        groups = Task.aggregate(pipeline)
        return [g["metrics"][0] for g in groups]
def get_project_stats(
    cls,
    company: str,
    project_ids: Sequence[str],
    specific_state: Optional[EntityVisibility] = None,
    include_children: bool = True,
    filter_: Mapping[str, Any] = None,
) -> Tuple[Dict[str, dict], Dict[str, dict]]:
    """
    Compute per-project task statistics for the given projects.

    Two Mongo aggregations are run over all tasks in the projects (and,
    when ``include_children`` is set, their sub-projects): one producing
    per-status counts split into active/archived sections, the other
    producing runtime figures. Child-project results are then folded into
    their parents via ``cls.aggregate_project_data``.

    :param specific_state: when given, report only that visibility state
    :param filter_: extra task filter passed to the pipeline builders
    :return: (stats, children) — ``stats`` maps project id -> section ->
        status counts / totals / runtime / recent-completion info;
        ``children`` maps project id -> its direct+nested child projects
        as ``{"id", "name"}`` dicts sorted by name.
    """
    if not project_ids:
        return {}, {}

    # Sub-projects are needed both to widen the task query and to report
    # the children list back to the caller
    child_projects = (_get_sub_projects(project_ids, _only=("id", "name"))
                      if include_children else {})
    project_ids_with_children = set(project_ids) | {
        c.id for c in itertools.chain.from_iterable(child_projects.values())
    }
    status_count_pipeline, runtime_pipeline = cls.make_projects_get_all_pipelines(
        company,
        project_ids=list(project_ids_with_children),
        specific_state=specific_state,
        filter_=filter_,
    )

    # Zero counts for every possible task status
    default_counts = dict.fromkeys(get_options(TaskStatus), 0)

    def set_default_count(entry):
        # Zero-fill statuses missing from the aggregation output
        return dict(default_counts, **entry)

    status_count = defaultdict(lambda: {})
    key = itemgetter(EntityVisibility.archived.value)
    for result in Task.aggregate(status_count_pipeline):
        # groupby needs its input sorted by the same key (archived flag)
        for k, group in groupby(sorted(result["counts"], key=key), key):
            section = (EntityVisibility.archived
                       if k else EntityVisibility.active).value
            status_count[result["_id"]][section] = set_default_count({
                count_entry["status"]: count_entry["count"]
                for count_entry in group
            })

    def sum_status_count(a: Mapping[str, Mapping],
                         b: Mapping[str, Mapping]) -> Dict[str, dict]:
        # Merge two section -> status -> count mappings by summing counts
        return {
            section: {
                status: nested_get(a, (section, status), default=0)
                + nested_get(b, (section, status), default=0)
                for status in set(a.get(section, {})) | set(b.get(section, {}))
            }
            for section in set(a) | set(b)
        }

    # Fold child-project status counts into their parents
    status_count = cls.aggregate_project_data(
        func=sum_status_count,
        project_ids=project_ids,
        child_projects=child_projects,
        data=status_count,
    )

    runtime = {
        result["_id"]: {k: v for k, v in result.items() if k != "_id"}
        for result in Task.aggregate(runtime_pipeline)
    }

    def sum_runtime(a: Mapping[str, Mapping],
                    b: Mapping[str, Mapping]) -> Dict[str, dict]:
        # Numeric runtime fields add up; "...max_task_started" fields take
        # the latest timestamp (datetime.min stands in for "no tasks")
        return {
            section: a.get(section, 0) + b.get(section, 0)
            if not section.endswith("max_task_started")
            else max(
                a.get(section) or datetime.min,
                b.get(section) or datetime.min)
            for section in set(a) | set(b)
        }

    # Fold child-project runtime figures into their parents
    runtime = cls.aggregate_project_data(
        func=sum_runtime,
        project_ids=project_ids,
        child_projects=child_projects,
        data=runtime,
    )

    def get_status_counts(project_id, section):
        # Assemble the reported stats entry for one project/section
        project_runtime = runtime.get(project_id, {})
        project_section_statuses = nested_get(
            status_count, (project_id, section), default=default_counts)

        def get_time_or_none(value):
            # Translate the datetime.min sentinel back to "never ran"
            return value if value != datetime.min else None

        return {
            "status_count": project_section_statuses,
            "total_tasks": sum(project_section_statuses.values()),
            "total_runtime": project_runtime.get(section, 0),
            "completed_tasks_24h":
                project_runtime.get(f"{section}_recently_completed", 0),
            "last_task_run": get_time_or_none(
                project_runtime.get(f"{section}_max_task_started", datetime.min)),
        }

    # Report either all visibility states or just the requested one
    report_for_states = [
        s for s in cls.visibility_states
        if not specific_state or specific_state == s
    ]

    stats = {
        project: {
            task_state.value: get_status_counts(project, task_state.value)
            for task_state in report_for_states
        }
        for project in project_ids
    }

    children = {
        project: sorted(
            [{
                "id": c.id,
                "name": c.name
            } for c in child_projects.get(project, [])],
            key=itemgetter("name"),
        )
        for project in project_ids
    }
    return stats, children
def get_project_stats(
    cls,
    company: str,
    project_ids: Sequence[str],
    specific_state: Optional[EntityVisibility] = None,
) -> Tuple[Dict[str, dict], Dict[str, dict]]:
    """
    Compute per-project task statistics for the given projects, always
    including their sub-projects in the task scope.

    Two Mongo aggregations are run: one producing per-status counts split
    into active/archived sections, the other producing total runtime.
    Child-project results are folded into their parents via
    ``cls.aggregate_project_data``.

    :param specific_state: when given, report only that visibility state
    :return: (stats, children) — ``stats`` maps project id -> section ->
        {"total_runtime", "status_count"}; ``children`` maps project id ->
        its child projects as ``{"id", "name"}`` dicts sorted by name.
    """
    if not project_ids:
        return {}, {}

    # Sub-projects widen the task query and are reported back to the caller
    child_projects = _get_sub_projects(project_ids, _only=("id", "name"))
    project_ids_with_children = set(project_ids) | {
        c.id for c in itertools.chain.from_iterable(child_projects.values())
    }
    status_count_pipeline, runtime_pipeline = cls.make_projects_get_all_pipelines(
        company,
        project_ids=list(project_ids_with_children),
        specific_state=specific_state,
    )

    # Zero counts for every possible task status
    default_counts = dict.fromkeys(get_options(TaskStatus), 0)

    def set_default_count(entry):
        # Zero-fill statuses missing from the aggregation output
        return dict(default_counts, **entry)

    status_count = defaultdict(lambda: {})
    key = itemgetter(EntityVisibility.archived.value)
    for result in Task.aggregate(status_count_pipeline):
        # groupby needs its input sorted by the same key (archived flag)
        for k, group in groupby(sorted(result["counts"], key=key), key):
            section = (
                EntityVisibility.archived if k else EntityVisibility.active
            ).value
            status_count[result["_id"]][section] = set_default_count(
                {
                    count_entry["status"]: count_entry["count"]
                    for count_entry in group
                }
            )

    def sum_status_count(
        a: Mapping[str, Mapping], b: Mapping[str, Mapping]
    ) -> Dict[str, dict]:
        # Merge two section -> status -> count mappings by summing counts
        return {
            section: {
                status: nested_get(a, (section, status), 0)
                + nested_get(b, (section, status), 0)
                for status in set(a.get(section, {})) | set(b.get(section, {}))
            }
            for section in set(a) | set(b)
        }

    # Fold child-project status counts into their parents
    status_count = cls.aggregate_project_data(
        func=sum_status_count,
        project_ids=project_ids,
        child_projects=child_projects,
        data=status_count,
    )

    runtime = {
        result["_id"]: {k: v for k, v in result.items() if k != "_id"}
        for result in Task.aggregate(runtime_pipeline)
    }

    def sum_runtime(
        a: Mapping[str, Mapping], b: Mapping[str, Mapping]
    ) -> Dict[str, dict]:
        # Runtime fields are summed per section across parent and children
        return {
            section: a.get(section, 0) + b.get(section, 0)
            for section in set(a) | set(b)
        }

    # Fold child-project runtime figures into their parents
    runtime = cls.aggregate_project_data(
        func=sum_runtime,
        project_ids=project_ids,
        child_projects=child_projects,
        data=runtime,
    )

    def get_status_counts(project_id, section):
        # Assemble the reported stats entry for one project/section
        return {
            "total_runtime": nested_get(runtime, (project_id, section), 0),
            "status_count": nested_get(
                status_count, (project_id, section), default_counts
            ),
        }

    # Report either all visibility states or just the requested one
    report_for_states = [
        s for s in EntityVisibility if not specific_state or specific_state == s
    ]

    stats = {
        project: {
            task_state.value: get_status_counts(project, task_state.value)
            for task_state in report_for_states
        }
        for project in project_ids
    }

    children = {
        project: sorted(
            [{"id": c.id, "name": c.name} for c in child_projects.get(project, [])],
            key=itemgetter("name"),
        )
        for project in project_ids
    }
    return stats, children