Ejemplo n.º 1
0
def factor_aggregate_value_not_in_range(
        data_service: TopicDataService, rule: MonitorRule,
        date_range: Tuple[datetime, datetime],
        arithmetic: EntityColumnAggregateArithmetic) -> RuleResult:
    """
    Aggregate the factor column of given rule with given arithmetic, restricted by the date range criteria,
    and check the aggregated value against the min/max declared in rule parameters.

    Returns IGNORED when the factor of rule cannot be found,
    SUCCESS when no data is returned or the value passes the range check,
    FAILED when the value is not a decimal or does not pass the range check.
    """
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED

    helper = data_service.get_data_entity_helper()
    column_name = helper.get_column_name(factor.name)
    aggregate_column = EntityStraightAggregateColumn(
        arithmetic=arithmetic, columnName=column_name)
    rows = data_service.find_straight_values(
        columns=[aggregate_column],
        criteria=build_date_range_criteria(date_range))
    if len(rows) == 0:
        # nothing to aggregate, treated as passed
        return RuleResult.SUCCESS

    # only the first returned row is inspected
    parsed, value = is_decimal(rows[0].get(column_name))
    if not parsed:
        # aggregated value is not a decimal, comparison is impossible
        return RuleResult.FAILED

    if in_range(value, rule.params.min, rule.params.max):
        return RuleResult.SUCCESS
    return RuleResult.FAILED
def exchange_topic_data_service(data_service: TopicDataService, topic_id: TopicId) -> TopicDataService:
    """
    Ask a data service for the topic with given id, using the principal of given data service.

    Raises DqcException when either the topic or its schema cannot be found.
    """
    principal_service = data_service.get_principal_service()
    topic_service = get_topic_service(principal_service)

    topic = topic_service.find_by_id(topic_id)
    if topic is None:
        raise DqcException(f'Topic[id={topic_id}] not found.')

    # schema is looked up by topic name within the tenant of the principal
    topic_name = topic.name
    tenant_id = principal_service.get_tenant_id()
    schema = topic_service.find_schema_by_name(topic_name, tenant_id)
    if schema is None:
        raise DqcException(f'Topic[name={topic.name}] not found.')

    storage = ask_topic_storage(schema, principal_service)
    return ask_topic_data_service(schema, storage, data_service.get_principal_service())
Ejemplo n.º 3
0
def run_retrieve_all_data_rules(
        data_service: TopicDataService, rules: List[MonitorRule],
        date_range: Tuple[datetime,
                          datetime], changed_rows_count_in_range: int,
        total_rows_count: int) -> List[Tuple[MonitorRule, RuleResult]]:
    """
    Run rules which should retrieve all data.
    Make sure pass-in rules are qualified, they will not be checked inside.

    :param data_service: data service of the topic to run rules on
    :param rules: rules to run, they are grouped by factor internally
    :param date_range: date range used to build the criteria for retrieving values
    :param changed_rows_count_in_range: passed through to every rule function
    :param total_rows_count: passed through to every rule function
    :return: one (rule, result) pair for each rule whose factor was found
    """
    rules_by_factor = group_rules_by_factor(rules)
    factors = find_factors_and_log_missed(data_service, rules_by_factor)

    data_entity_helper = data_service.get_data_entity_helper()
    column_names = ArrayHelper(factors).map(
        lambda x: data_entity_helper.get_column_name(x.name)).to_list()
    rows = data_service.find_distinct_values(
        criteria=build_date_range_criteria(date_range),
        column_names=column_names,
        distinct_value_on_single_column=True)

    # deal with data
    # cast values to decimal since all rules are deal with numbers
    # value cannot be cast, will be treated as 0
    def to_decimal_or_zero(value: Any) -> Any:
        parsed, decimal_value = is_decimal(value)
        return decimal_value if parsed else 0

    def translate_to_array(data_rows: List[Dict[str, Any]],
                           factor: Factor) -> List[List[Any]]:
        # rows were queried with storage column names (see column_names above),
        # so the value must be read by column name rather than by factor name
        column_name = data_entity_helper.get_column_name(factor.name)
        # map, not filter: every row must be kept, and a value which cannot be cast
        # (as well as a real 0) must survive as a number instead of being dropped
        return ArrayHelper(data_rows) \
            .map(lambda x: [to_decimal_or_zero(x.get(column_name))]) \
            .to_list()

    def run_rules(factor: Factor,
                  data: List[Any]) -> List[Tuple[MonitorRule, RuleResult]]:
        concerned_rules = rules_by_factor.get(factor.factorId)
        if concerned_rules is None or len(concerned_rules) == 0:
            return []

        def run_rule(rule: MonitorRule) -> Tuple[MonitorRule, RuleResult]:
            # rule function is picked by rule code
            result = retrieve_all_data_rules_map[rule.code](
                data_service, factor, data, rule, date_range,
                changed_rows_count_in_range, total_rows_count)
            return rule, result

        return ArrayHelper(concerned_rules).map(run_rule).to_list()

    # run rules factor by factor, then flatten the per-factor results into one list
    return ArrayHelper(factors) \
        .map(lambda x: (x, translate_to_array(rows, x))) \
        .map(lambda x: run_rules(x[0], x[1])) \
        .reduce(lambda all_results, x: [*all_results, *x], [])
def factor_string_length_not_in_range(data_service: TopicDataService,
                                      rule: MonitorRule,
                                      date_range: Tuple[datetime, datetime],
                                      changed_rows_count_in_range: int,
                                      total_rows_count: int) -> RuleResult:
    """
    Count rows, restricted by the date range criteria, whose factor char length is less than
    the min or greater than the max declared in rule parameters.

    Returns IGNORED when the factor of rule cannot be found,
    SUCCESS when no such row exists, otherwise FAILED.
    """
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED

    def char_length_of_factor() -> ComputedLiteral:
        # a fresh literal for each side of the OR joint
        return ComputedLiteral(
            operator=ComputedLiteralOperator.CHAR_LENGTH,
            elements=[build_column_name_literal(factor, data_service)])

    shorter_than_min = EntityCriteriaExpression(
        left=char_length_of_factor(),
        operator=EntityCriteriaOperator.LESS_THAN,
        right=rule.params.min)
    longer_than_max = EntityCriteriaExpression(
        left=char_length_of_factor(),
        operator=EntityCriteriaOperator.GREATER_THAN,
        right=rule.params.max)
    out_of_range = EntityCriteriaJoint(
        conjunction=EntityCriteriaJointConjunction.OR,
        children=[shorter_than_min, longer_than_max])

    count = data_service.count_by_criteria(
        [out_of_range, *build_date_range_criteria(date_range)])

    if count == 0:
        return RuleResult.SUCCESS
    return RuleResult.FAILED
Ejemplo n.º 5
0
def find_task_rows(process_date: date, scheduler: TopicSnapshotScheduler,
                   source_topic_schema: TopicSchema,
                   source_topic_service: TopicDataService,
                   principal_service: PrincipalService) -> List[int]:
    """
    Find values of the id column of source topic rows.
    When the scheduler declares filters, they are parsed and run with variables built from
    process date and scheduler frequency; otherwise no criteria is applied.
    """
    id_column = TopicDataColumnNames.ID.value

    filter_declared = not (
        scheduler.filter is None
        or scheduler.filter.filters is None
        or len(scheduler.filter.filters) == 0)
    if filter_declared:
        parsed_criteria = parse_condition_for_storage(
            scheduler.filter, [source_topic_schema], principal_service, True)
        variables = build_variables(process_date, scheduler.frequency)
        criteria = [parsed_criteria.run(variables, principal_service)]
    else:
        # no filter declared, retrieve ids without criteria
        criteria = None

    rows = source_topic_service.find_distinct_values(
        criteria, [id_column], False)
    return ArrayHelper(rows).map(lambda row: row.get(id_column)).to_list()
Ejemplo n.º 6
0
def factor_value_assert(
    data_service: TopicDataService, rule: MonitorRule,
    date_range: Tuple[datetime, datetime],
    assert_expression: Callable[[Factor],
                                EntityCriteriaExpression]) -> RuleResult:
    """
    Count rows which match the expression built by given function for the factor of rule,
    restricted by the date range criteria.

    Returns IGNORED when the factor of rule cannot be found,
    SUCCESS when no row matches, otherwise FAILED.
    """
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED

    criteria = [
        assert_expression(factor),
        *build_date_range_criteria(date_range)
    ]
    matched_count = data_service.count_by_criteria(criteria)

    if matched_count == 0:
        return RuleResult.SUCCESS
    return RuleResult.FAILED
def find_factor(
        data_service: TopicDataService, factor_id: Optional[FactorId],
        rule: MonitorRule) -> Tuple[bool, Optional[Factor]]:
    """
    Find the factor with given id in the topic of given data service.

    Returns (True, factor) when found. Returns (False, None), with an error logged,
    when the factor id is blank or no factor of the topic has this id.
    """
    if is_blank(factor_id):
        logger.error(f'Factor id not declared on rule[{rule.dict()}].')
        return False, None

    candidates = data_service.get_topic().factors
    matched = ArrayHelper(candidates).find(lambda candidate: candidate.factorId == factor_id)
    if matched is not None:
        return True, matched

    logger.error(f'Factor[id={factor_id}] on rule[{rule.dict()}] not found.')
    return False, None
Ejemplo n.º 8
0
def run_retrieve_distinct_data_rules(
        data_service: TopicDataService, rules: List[MonitorRule],
        date_range: Tuple[datetime,
                          datetime], changed_rows_count_in_range: int,
        total_rows_count: int) -> List[Tuple[MonitorRule, RuleResult]]:
    """
    Run rules which should retrieve distinct data and count.
    Make sure pass-in rules are qualified, they will not be checked inside.
    """
    rules_by_factor = group_rules_by_factor(rules)
    factors = find_factors_and_log_missed(data_service, rules_by_factor)

    data_entity_helper = data_service.get_data_entity_helper()

    def column_name_of(factor: Factor):
        return data_entity_helper.get_column_name(factor.name)

    def find_values_and_counts(factor: Factor) -> List[Tuple[Any, int]]:
        # query the factor column together with its count aggregate, aliased as 'count'
        found_rows = data_service.find_straight_values(
            criteria=build_date_range_criteria(date_range),
            columns=[
                EntityStraightAggregateColumn(
                    arithmetic=EntityColumnAggregateArithmetic.COUNT,
                    columnName=column_name_of(factor),
                    alias='count'),
                EntityStraightAggregateColumn(
                    columnName=column_name_of(factor))
            ])
        # translate each row to a (value, count) pair
        column_name = column_name_of(factor)
        return ArrayHelper(found_rows).map(
            lambda row: (row.get(column_name), row.get('count'))).to_list()

    def run_rules(factor: Factor) -> List[Tuple[MonitorRule, RuleResult]]:
        concerned_rules = rules_by_factor.get(factor.factorId)
        if concerned_rules is None or len(concerned_rules) == 0:
            # no rule on this factor, skip data retrieving
            return []

        data = find_values_and_counts(factor)

        def run_rule(rule: MonitorRule) -> Tuple[MonitorRule, RuleResult]:
            # rule function is picked by rule code
            check = retrieve_distinct_data_rules_map[rule.code]
            return rule, check(
                data_service, factor, data, rule, date_range,
                changed_rows_count_in_range, total_rows_count)

        return ArrayHelper(concerned_rules).map(run_rule).to_list()

    # run factor by factor, then flatten the per-factor results into one list
    results_per_factor = ArrayHelper(factors).map(run_rules)
    return results_per_factor.reduce(
        lambda all_results, x: [*all_results, *x], [])
def factor_mismatch_type(data_service: TopicDataService, rule: MonitorRule,
                         date_range: Tuple[datetime, datetime],
                         changed_rows_count_in_range: int,
                         total_rows_count: int) -> RuleResult:
    """
    Count rows which match the mismatch criteria built for the factor of rule,
    restricted by the date range criteria.

    Returns IGNORED when the factor of rule cannot be found or no mismatch detection is needed,
    SUCCESS when no row matches, otherwise FAILED.
    """
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED

    should_detect, mismatch_criteria = build_mismatch_statement(
        factor, data_service)
    if not should_detect:
        # detection is not needed for this factor
        return RuleResult.IGNORED

    criteria = [*mismatch_criteria, *build_date_range_criteria(date_range)]
    mismatched_count = data_service.count_by_criteria(criteria)

    if mismatched_count == 0:
        return RuleResult.SUCCESS
    return RuleResult.FAILED
Ejemplo n.º 10
0
def rows_count_mismatch_with_another(data_service: TopicDataService,
                                     rule: Optional[MonitorRule],
                                     date_range: Tuple[datetime, datetime],
                                     has_data: bool) -> int:
    """
    Get the count of rows of current topic by the date range criteria when has_data is true,
    otherwise use 0 without querying. The count is handed over to do_it along with the rule,
    and is always returned.
    """
    changed_row_count = (
        data_service.count_by_criteria(build_date_range_criteria(date_range))
        if has_data else 0)

    do_it(data_service, rule, date_range, changed_row_count)

    return changed_row_count
def factor_empty_over_coverage(
        data_service: TopicDataService, rule: MonitorRule,
        date_range: Tuple[datetime, datetime],
        changed_rows_count_in_range: int, total_rows_count: int
) -> RuleResult:
    """
    Check whether the percentage of rows with an empty factor value is over the coverage rate
    declared in rule parameters.

    :param data_service: data service of the topic to check
    :param rule: rule to run, factorId and params.coverageRate are read from it
    :param date_range: not used, empty rows are counted without date range criteria
    :param changed_rows_count_in_range: not used
    :param total_rows_count: denominator of the empty rate
    :return: SUCCESS when there is no row at all or the empty rate is not over the coverage rate,
        IGNORED when the factor of rule cannot be found,
        FAILED when the empty rate is over the coverage rate
    """
    if total_rows_count == 0:
        # no data, nothing can be empty
        return RuleResult.SUCCESS
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED

    count = data_service.count_by_criteria([
        EntityCriteriaExpression(
            left=build_column_name_literal(factor, data_service),
            operator=EntityCriteriaOperator.IS_EMPTY
        )
    ])
    # empty rate in percentage
    rate = count / total_rows_count * 100
    # like the sibling rules, the rule fails when the situation it is named after is detected,
    # which here is "empty rate over coverage rate"
    return RuleResult.FAILED if rate > rule.params.coverageRate else RuleResult.SUCCESS
Ejemplo n.º 12
0
def factor_string_length_mismatch(data_service: TopicDataService,
                                  rule: MonitorRule,
                                  date_range: Tuple[datetime, datetime],
                                  changed_rows_count_in_range: int,
                                  total_rows_count: int) -> RuleResult:
    """
    Count rows, restricted by the date range criteria, whose factor char length does not equal
    the length declared in rule parameters.

    Returns IGNORED when the factor of rule cannot be found,
    SUCCESS when no such row exists, otherwise FAILED.
    """
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED

    char_length_of_factor = ComputedLiteral(
        operator=ComputedLiteralOperator.CHAR_LENGTH,
        elements=[build_column_name_literal(factor, data_service)])
    length_mismatched = EntityCriteriaExpression(
        left=char_length_of_factor,
        operator=EntityCriteriaOperator.NOT_EQUALS,
        right=rule.params.length)

    count = data_service.count_by_criteria(
        [length_mismatched, *build_date_range_criteria(date_range)])

    if count == 0:
        return RuleResult.SUCCESS
    return RuleResult.FAILED
Ejemplo n.º 13
0
def do_it(data_service: TopicDataService, rule: Optional[MonitorRule],
          date_range: Tuple[datetime,
                            datetime], changed_row_count: int) -> None:
    """
    Compare given row count with the row count of another topic, which is declared in rule parameters,
    by the same date range criteria, and trigger the rule with the comparison result.

    Does nothing when rule is None. Logs an error and does nothing else
    when the id of another topic is blank.
    """
    if rule is None:
        return

    another_topic_id = rule.params.topicId
    if is_blank(another_topic_id):
        logger.error(f'Another topic id not declared on rule[{rule.dict()}].')
        return

    # count rows of another topic by the same date range
    another_data_service = exchange_topic_data_service(
        data_service, another_topic_id)
    count_of_another = another_data_service.count_by_criteria(
        build_date_range_criteria(date_range))

    if changed_row_count != count_of_another:
        result = RuleResult.FAILED
    else:
        result = RuleResult.SUCCESS

    # start of the date range is passed along with the result
    trigger(rule, result, date_range[0], data_service.get_principal_service())
def factor_and_another(
        data_service: TopicDataService, rule: MonitorRule,
        date_range: Tuple[datetime, datetime],
        changed_rows_count_in_range: int, total_rows_count: int
) -> RuleResult:
    """
    Count rows, restricted by the date range criteria, where the factor of rule does not equal
    another factor which is declared in rule parameters.

    Returns IGNORED when either factor cannot be found,
    SUCCESS when no such row exists, otherwise FAILED.
    """
    found, factor = find_factor(data_service, rule.factorId, rule)
    if not found:
        return RuleResult.IGNORED

    another_found, another_factor = find_factor(data_service, rule.params.factorId, rule)
    if not another_found:
        return RuleResult.IGNORED

    not_equal = EntityCriteriaExpression(
        left=build_column_name_literal(factor, data_service),
        operator=EntityCriteriaOperator.NOT_EQUALS,
        right=build_column_name_literal(another_factor, data_service),
    )
    mismatched_count = data_service.count_by_criteria(
        [not_equal, *build_date_range_criteria(date_range)])

    if mismatched_count == 0:
        return RuleResult.SUCCESS
    return RuleResult.FAILED
def build_column_name_literal(factor: Factor, data_service: TopicDataService) -> ColumnNameLiteral:
    """
    Build a column name literal for given factor; the column name is asked from
    the data entity helper of given data service by factor name.
    """
    data_entity_helper = data_service.get_data_entity_helper()
    column_name = data_entity_helper.get_column_name(factor.name)
    return ColumnNameLiteral(columnName=column_name)