def get_teams_schema(ds, **kwargs):
    """
    Save the most recent configuration of every team in the Airflow
    variable named "config".

    Configurations are listed from the most recently updated to the oldest
    and only the first one met for each Kafka topic is kept. For every label
    of a kept configuration, a Neo4j query counts the nodes whose label is
    "<topic>_<label>".

    The Flask application is read from kwargs["params"]["app"]; `ds` is
    accepted but not used here.
    """
    with kwargs["params"]["app"].app_context():
        from depc.controllers.configs import ConfigController

        # Newest configurations come first
        all_configs = ConfigController._list(order_by="updated_at", reverse=True)

        teams = {}
        for config in all_configs:
            team = config.team
            topic = team.kafka_topic

            # A more recent configuration has already been kept for this topic
            if topic in teams:
                continue

            schema = config.data
            logger.info("[{0}] Configuration : {1}".format(team.name, schema))

            entry = {
                "id": str(team.id),
                "name": team.name,
                "topic": topic,
                "schema": schema,
                "labels": {},
            }

            # Count number of nodes per label
            labels = schema.keys()
            logger.info(
                "[{0}] Counting nodes for {1} labels...".format(team.name, len(labels))
            )
            for label in labels:
                neo_label = "{}_{}".format(topic, label)
                count_query = "MATCH (n:{label}) RETURN count(n) AS Count".format(
                    label=neo_label
                )
                count = list(get_records(count_query))[0].get("Count")
                logger.info(
                    "[{0}] {1} nodes for label {2}...".format(team.name, count, label)
                )
                entry["labels"][label] = count

            teams[topic] = entry

    # Save the config into an Airflow variable
    Variable.set("config", list(teams.values()), serialize_json=True)
def get_impacted_nodes_all(cls, team_id, label, node, impacted_label, ts, with_inactive_nodes):
    """
    Return every node impacted by `node` as a single JSON array string.

    The impacted nodes are fetched from Neo4j by batches of 50000, each batch
    being serialized on its own (4-space indentation) and the pieces being
    concatenated into one array.

    :param team_id: identifier of the team, used to find its Kafka topic
    :param label: label of the source node
    :param node: name of the source node
    :param impacted_label: label of the impacted nodes, must not be empty
    :param ts: timestamp, anything `int()` accepts, must be >= 0
    :param with_inactive_nodes: forwarded to `_compute_impacted_nodes_from_data`
    :return: a string holding a JSON array ("[\\n]" when nothing is returned)
    :raises IntegrityError: if `ts` is not a number, is negative, or if
        `impacted_label` is empty
    """
    try:
        ts = int(ts)
    except (TypeError, ValueError):
        raise IntegrityError("'ts' parameter must be a positive number")

    if ts < 0:
        raise IntegrityError("'ts' parameter must be a positive number")

    if not impacted_label:
        raise IntegrityError("'impactedLabel' parameter must not be empty")

    team = TeamController._get({"Team": {"id": team_id}})

    nodes_batch = 50000
    skip = 0
    total_count = cls.get_impacted_nodes_count(team_id, label, node, impacted_label)["count"]

    # Serialized content of each non-empty batch, without the enclosing brackets
    serialized_batches = []

    # Load all impacted nodes data by batch
    while skip < total_count:
        query = cls._build_impacted_nodes_queries(
            topic=team.kafka_topic,
            label=label,
            node=node,
            impacted_label=impacted_label,
            skip=skip,
            limit=nodes_batch,
            count=False,
        )
        # NOTE(review): assumes the helper returns a list — confirm
        batch = cls._compute_impacted_nodes_from_data(
            get_records(query).data(),
            ts,
            with_inactive_nodes=with_inactive_nodes,
        )
        skip += nodes_batch

        # A batch can end up empty once filtered. Skipping it here (instead
        # of appending a "," after every non-final batch) guarantees that no
        # trailing comma is left when the last batches are empty, which
        # would make the result invalid JSON.
        if not batch:
            continue

        # json.dumps() renders a non-empty list as "[\n    ...\n]": drop the
        # leading "[" and the trailing "\n]" to keep only the elements
        serialized_batches.append(json.dumps(batch, indent=4)[1:-2])

    return "[" + ",".join(serialized_batches) + "\n]"
def skip_by_requirement(app, request):
    """Skip the current test when the service it requires is not usable.

    The requirement is the first argument of the closest
    ``skip_requirement`` marker. Only "neo4j" is handled: a trivial query is
    sent and the test is skipped if the server is unavailable or rejects
    the authentication. Without marker, or with any other requirement,
    nothing happens.

    Usage:
        >>> @pytest.mark.skip_requirement('neo4j')
        >>> def test_get_team_dependencies(client):
        >>>     pass
    """
    marker = request.node.get_closest_marker('skip_requirement')
    if not marker:
        return

    if marker.args[0] != 'neo4j':
        return

    # Skip test if neo4j is not running (or not well configured)
    with app.app_context():
        try:
            get_records("RETURN 1")
        except ServiceUnavailable as e:
            pytest.skip("Neo4j server error : {}".format(e))
        except AuthError as e:
            pytest.skip("Neo4j authentication error : {}".format(e))
def get_impacted_nodes_count(cls, team_id, label, node, impacted_label):
    """
    Count the nodes of label `impacted_label` impacted by `node`.

    :return: a dict of the form {"count": <first value returned by the query>}
    :raises IntegrityError: if `impacted_label` is empty
    """
    if not impacted_label:
        raise IntegrityError("'impactedLabel' parameter must not be empty")

    # The team is only needed for its Kafka topic
    topic = TeamController._get({"Team": {"id": team_id}}).kafka_topic

    count_query = cls._build_impacted_nodes_queries(
        topic=topic,
        label=label,
        node=node,
        impacted_label=impacted_label,
        count=True,
    )

    return {"count": get_records(count_query).value()[0]}
def get_impacted_nodes(cls, team_id, label, node, impacted_label, skip, limit, ts):
    """
    Return one page of the nodes of label `impacted_label` impacted by `node`.

    `skip`, `limit` and `ts` are converted with `int()` and must be >= 0.
    All impacted nodes (active and inactive) are returned, with metadata
    indicating if they are active or not.

    :raises IntegrityError: if `skip`, `limit` or `ts` is not a number or is
        negative, or if `impacted_label` is empty
    """
    numbers_error = "'skip', 'limit' and 'ts' parameters must be positive numbers"

    try:
        skip, limit, ts = int(skip), int(limit), int(ts)
    except (TypeError, ValueError):
        raise IntegrityError(numbers_error)

    # The smallest value being negative means at least one of them is
    if min(skip, limit, ts) < 0:
        raise IntegrityError(numbers_error)

    if not impacted_label:
        raise IntegrityError("'impactedLabel' parameter must not be empty")

    topic = TeamController._get({"Team": {"id": team_id}}).kafka_topic

    page_query = cls._build_impacted_nodes_queries(
        topic=topic,
        label=label,
        node=node,
        impacted_label=impacted_label,
        skip=skip,
        limit=limit,
        count=False,
    )
    page_data = get_records(page_query).data()

    return cls._compute_impacted_nodes_from_data(page_data, ts, with_inactive_nodes=True)
def get_node_dependencies(
    cls,
    team_id,
    label,
    node,
    day=None,
    filter_on_config=False,
    include_inactive=False,
):
    """
    Return the dependencies of a node, grouped by label and as a graph.

    :param team_id: identifier of the team, used to find its Kafka topic
    :param label: label of the node
    :param node: name of the node
    :param day: day ("YYYY-MM-DD") used to decide whether a dependency is
        active. NOTE(review): the default `None` is handed to `arrow.get()`
        as soon as one relationship is found — confirm callers always
        provide a day.
    :param filter_on_config: forwarded to `_build_dependencies_query`
    :param include_inactive: when true, inactive dependencies are kept and
        flagged with "inactive": True instead of being dropped
    :return: a dict with two keys:
        - "dependencies": {label title: [node properties, ...]}
        - "graph": {"nodes": [...], "relationships": [...]}
    """
    topic = TeamController._get({"Team": {"id": team_id}}).kafka_topic
    query = cls._build_dependencies_query(team_id, topic, label, node, filter_on_config)

    dependencies = {
        "dependencies": {},
        "graph": {"nodes": [], "relationships": []},
    }
    by_title = dependencies["dependencies"]
    graph = dependencies["graph"]

    # The label is 'acme_Mylabel', we just want 'Mylabel'
    prefix_length = len(topic) + 1

    # Bounds of the requested day. They do not depend on the record, so they
    # are computed only once, when the first relationship is met (and not at
    # all when the node has no relationship).
    day_bounds = None

    # Loop on all relationships
    for idx, record in enumerate(get_records(query)):

        # Handle the main node
        if idx == 0:
            main_node = record.get("n")
            main_props = dict(main_node.items())
            title = list(main_node.labels)[0][prefix_length:]
            by_title.setdefault(title, []).append(main_props)
            graph["nodes"].append(
                {"id": main_node.id, "label": main_props["name"], "title": title}
            )

        # Handle the relationship
        rel = record.get("r")
        if not rel:
            continue

        start_node = rel.start_node
        end_node = rel.end_node

        if day_bounds is None:
            requested_day = arrow.get(day, "YYYY-MM-DD")
            day_bounds = (
                requested_day.floor("day").timestamp,
                requested_day.ceil("day").timestamp,
            )
        start, end = day_bounds

        # Check inactive nodes
        if not (
            is_active_node(start, end, end_node)
            and has_active_relationship(start, end, rel.get("periods"))
        ):
            if not include_inactive:
                continue
            # The flag is stored on the node object itself and read back
            # below with getattr()
            setattr(end_node, "inactive", True)

        end_props = dict(end_node.items())
        title = list(end_node.labels)[0][prefix_length:]

        by_title.setdefault(title, []).append(
            {
                **end_props,
                **{
                    "periods": list(rel.get("periods")),
                    "inactive": getattr(end_node, "inactive", False),
                },
            }
        )

        graph["nodes"].append(
            {"id": end_node.id, "label": end_props["name"], "title": title}
        )

        graph["relationships"].append(
            {
                "id": rel.id,
                "from": start_node.id,
                "to": end_node.id,
                "arrows": "to",
                "periods": list(rel.get("periods")),
            }
        )

    return dependencies
def execute(self, context):
    """
    Compute the QOS of a chunk of nodes by executing the rule of their label.

    The nodes are read from Neo4j (ordered by name, `self.skip` /
    `self.length` delimiting the chunk) and the ones that are not active
    during the period of `context["ds"]` are dropped. For every remaining
    node the rule is executed, then:
    - if a QOS is available, a "depc.qos.node" metric is written and the
      result is stored in Redis;
    - otherwise the node name is added to the "noqos" Redis set.

    :return: False if the label has no associated rule, None otherwise
    """
    from depc.controllers import NotFoundError
    from depc.controllers.rules import RuleController
    from depc.extensions import redis_scheduler as redis
    from depc.utils import get_start_end_ts

    ds = context["ds"]
    start, end = get_start_end_ts(ds)

    with self.app.app_context():

        # Get the nodes for this team and this label
        query = (
            "MATCH(n:{label}) RETURN n AS Node "
            "ORDER BY Node.name "
            "SKIP {skip} LIMIT {limit}"
        ).format(label=self.full_label, skip=self.skip, limit=int(self.length))
        all_nodes = [dict(record.get("Node").items()) for record in get_records(query)]

        # Remove old nodes
        nodes = [props for props in all_nodes if is_active_node(start, end, props)]

        # Get the rule associated to the label for this team
        try:
            rule = RuleController.get(
                filters={"Rule": {"name": self.rule_name, "team_id": self.team_id}}
            )
        except NotFoundError:
            self.log.warning(
                "[{0}] The label {1} has no associated rule in DEPC".format(
                    self.team_name, self.label
                )
            )
            return False

        rule_id = rule["id"]
        auto_fill = check_enable_auto_fill(rule_id, self.team_id)

        # Redis keys shared by all the nodes of this chunk
        key = "{ds}.{team}.{label}".format(ds=ds, team=self.team_name, label=self.label)
        sorted_key = "{}.sorted".format(key)
        noqos_key = "{ds}.{team}.{label}.noqos".format(
            ds=ds, team=self.team_name, label=self.label
        )

        has_qos = False
        for node in nodes:
            node_name = node["name"]
            result = RuleController.execute(
                rule_id=rule_id,
                auto_fill=auto_fill,
                name=node_name,
                start=start,
                end=end,
            )
            qos = result["qos"]["qos"]

            if qos == "unknown":
                self.log.warning(
                    "[{0}/{1}] No QOS for {2}".format(self.team_name, self.label, node_name)
                )

                # Add it in redis to compute some stats in AfterSubdagOperator
                redis.sadd(noqos_key, node_name)
                continue

            has_qos = True
            self.log.info(
                "[{0}/{1}] The QOS of {2} is {3}%".format(
                    self.team_name, self.label, node_name, qos
                )
            )

            # Saving to Beamium
            self.write_metric(
                metric="depc.qos.node",
                ts=start,
                value=qos,
                tags={"label": self.label, "name": node_name, "team": self.team_id},
            )

            # Used for average computing
            if not self.excluded_from_label_average(self.team_name, self.label, node_name):
                redis.zadd(sorted_key, node_name, qos)

            # Save information to reuse it later (`bools_dps` is used in
            # OperationOperator and `qos` is used in AggregationOperator)
            redis.set(
                "{}.{}.node".format(key, node_name),
                json.dumps({"bools_dps": result["qos"]["bools_dps"], "qos": qos}),
            )

        if not has_qos:
            self.log.warning(
                "[{0}/{1}] No QOS found for any items".format(self.team_name, self.label)
            )
def execute(self, context):
    """
    Compute the QOS of a chunk of nodes from the QOS of their dependencies.

    The nodes and their dependencies are fetched from Neo4j with the query
    given by `self.build_query()` and filtered by `self.filter_records()`.
    The QOS of each dependency is read from Redis (once per dependency),
    then `self.compute_node_qos()` combines the available ones. The result
    of each node is stored in Redis and the "depc.qos.node" metrics are
    written in one call at the end. Nodes having no dependency with a QOS
    are added to the "noqos" Redis set.

    :return: None
    """
    from depc.extensions import redis_scheduler as redis
    from depc.utils import get_start_end_ts

    ds = context["ds"]
    start, end = get_start_end_ts(ds)

    name, dependencies, query = self.build_query()

    self.log.info(
        "[{team}/{label}] Fetching nodes and its dependencies using the following query : {query}".format(
            team=self.team_name, label=self.label, query=query
        )
    )

    # Retrieve the node and its dependencies
    start_time = time.time()
    with self.app.app_context():
        records = get_records(query)
        nodes = self.filter_records(
            start=start,
            end=end,
            records=[r for r in records],
            name=name,
            dependencies=dependencies,
        )

    # No node has dependency
    if not nodes:
        self.log.warning(
            "[{team}/{label}] No node has dependency.".format(
                team=self.team_name, label=self.label
            )
        )
        return

    self.log.info(
        "[{team}/{label}] Nodes fetched in {t}s, processing it...".format(
            team=self.team_name,
            label=self.label,
            t=round(time.time() - start_time, 3),
        )
    )

    # Process the nodes and remove the archived ones
    start_time = time.time()
    node_names = list(nodes.keys())
    msg = "[{team}/{label}] Processing done in {t}s, {count} nodes returned (from {begin} to {end})"
    self.log.info(
        msg.format(
            team=self.team_name,
            label=self.label,
            t=round(time.time() - start_time, 3),
            count=len(nodes),
            begin=node_names[0],
            end=node_names[-1],
        )
    )

    self.log.info(
        "[{team}/{label}] Computing the QOS for {count} nodes...".format(
            team=self.team_name, label=self.label, count=len(nodes)
        )
    )

    start_time = time.time()

    # Redis keys shared by all the nodes of this chunk
    key = "{ds}.{team}.{label}".format(ds=ds, team=self.team_name, label=self.label)
    sorted_key = "{}.sorted".format(key)
    noqos_key = "{ds}.{team}.{label}.noqos".format(
        ds=ds, team=self.team_name, label=self.label
    )

    # QOS already loaded from Redis, by "<label>.<name>" of the dependency
    qos_by_dep = {}
    metrics = []
    nodes_without_qos = []

    for idx, (node, deps) in enumerate(nodes.items(), start=1):
        self.log.info(
            "[{team}/{label}] Fetching the QOS of {count} dependencies for {node}...".format(
                team=self.team_name, label=self.label, count=len(deps), node=node
            )
        )

        node_deps = []
        for d in deps:
            dep_name = d["name"]
            dep_label = d["label"]

            # The label contains the topic but not the redis key
            dep = "{0}.{1}".format(dep_label.split("_")[1], dep_name)

            # It's the first time we see this dependency
            if dep not in qos_by_dep:

                # We retrieve its QOS in Redis
                raw_qos = redis.get(
                    "{ds}.{team}.{dep}.node".format(ds=ds, team=self.team_name, dep=dep)
                )
                if raw_qos:
                    qos_by_dep[dep] = json.loads(
                        raw_qos.decode("utf-8"), cls=BoolsDpsDecoder
                    )

            # Add the result of the dependencies for this node
            if dep in qos_by_dep:
                node_deps.append(qos_by_dep[dep])
            else:
                msg = ("The QOS of {dep} is not available "
                       "(no data in any metric ?)".format(dep=dep_name))
                logger.warning(msg)

        if node_deps:
            msg = (
                "[{team}/{label}] Computing the QOS of {node} using a {type} "
                "between {count} dependencies with valid QOS..."
            )
            self.log.info(
                msg.format(
                    team=self.team_name,
                    label=self.label,
                    node=node,
                    type=self.type,
                    count=len(node_deps),
                )
            )

            node_qos = self.compute_node_qos(data=node_deps, start=start, end=end)

            self.log.info(
                "[{0}/{1}] The QOS of {2} is {3}%".format(
                    self.team_name, self.label, node, node_qos["qos"]
                )
            )

            metrics.append(
                self.format_metric(
                    metric="depc.qos.node",
                    ts=start,
                    value=node_qos["qos"],
                    tags={"label": self.label, "name": node, "team": self.team_id},
                )
            )

            if not self.excluded_from_label_average(self.team_name, self.label, node):
                redis.zadd(sorted_key, node, node_qos["qos"])

            # Save information to reuse it later (`bools_dps` is used in
            # OperationOperator and `qos` is used in AggregationOperator)
            redis.set("{}.{}.node".format(key, node), json.dumps(node_qos))
        else:
            self.log.warning(
                "[{team}/{label}] {node} has no dependency with QOS".format(
                    team=self.team_name, label=self.label, node=node
                )
            )
            nodes_without_qos.append(node)

            # Add it in redis to compute some stats in AfterSubdagOperator
            redis.sadd(noqos_key, node)

        # Progress report every 1000 nodes
        if idx % 1000 == 0:
            self.log.info(
                "[{team}/{label}] {count} nodes processed in {time}s".format(
                    team=self.team_name,
                    label=self.label,
                    count=idx,
                    time=round(time.time() - start_time, 3),
                )
            )

    self.log.info(
        "[{team}/{label}] The QOS of {count} nodes has been computed in {time}s".format(
            team=self.team_name,
            label=self.label,
            count=len(metrics),
            time=round(time.time() - start_time, 3),
        )
    )

    if nodes_without_qos:
        msg = "[{team}/{label}] The QOS could not be found for {count} nodes ({excerpt}, ...)"
        self.log.warning(
            msg.format(
                team=self.team_name,
                label=self.label,
                count=len(nodes_without_qos),
                excerpt=", ".join(nodes_without_qos[:5]),
            )
        )

    # Write metrics for Beamium
    if not metrics:
        self.log.warning(
            "[{team}/{label}] No QOS to save, chunk is finished.".format(
                team=self.team_name, label=self.label
            )
        )
    else:
        self.write_metrics(metrics)