def test_create_rule_using_org_id_to_establish_dq_check_relationship(self):
    # POSTing a rule to the org-scoped rules endpoint should attach the new
    # rule to the data quality check retrieved for that same org id.
    def address_line_2_rule_count():
        # Re-retrieve through the org id on every call so the count reflects
        # what is currently stored, not a stale object.
        check = DataQualityCheck.retrieve(self.org.id)
        return check.rules.filter(field='address_line_2').count()

    # Ensure no address_line_2 rules exist by default beforehand
    self.assertEqual(0, address_line_2_rule_count())

    payload = json.dumps({
        'field': 'address_line_2',
        'table_name': 'PropertyState',
        'enabled': True,
        'data_type': Rule.TYPE_STRING,
        'rule_type': Rule.RULE_TYPE_DEFAULT,
        'condition': Rule.RULE_INCLUDE,
        'required': False,
        'not_null': False,
        'min': None,
        'max': None,
        'text_match': 'some random text',
        'severity': Rule.SEVERITY_ERROR,
        'units': "",
        'status_label': None,
    })
    endpoint = reverse(
        'api:v3:data_quality_check-rules-list',
        kwargs={'nested_organization_id': self.org.id},
    )
    self.client.post(endpoint, content_type='application/json', data=payload)

    self.assertEqual(1, address_line_2_rule_count())
def create_organization(user=None, org_name='', *args, **kwargs):
    """
    Create an organization from scratch and optionally link a user to it.

    Beyond the Organization row itself, this seeds the default status
    labels, the default columns, and the default data quality rules for the
    new organization.

    :param user: user inst. to associate with the organization; when falsy no
        OrganizationUser is created.
    :param org_name: str, name of Organization we'd like to create.
    :param args: accepted for call compatibility; not read by this function.
    :param kwargs: accepted for call compatibility; not read by this function.
    :returns: tuple of (organization, organization_user, user_added), where
        organization_user is None and user_added is False if no user was given.
    """
    from seed.models import StatusLabel as Label

    organization = Organization.objects.create(name=org_name)

    if user:
        organization_user, user_added = OrganizationUser.objects.get_or_create(
            user=user,
            organization=organization,
        )
    else:
        organization_user, user_added = None, False

    for label_name in Label.DEFAULT_LABELS:
        Label.objects.get_or_create(
            name=label_name,
            super_organization=organization,
            defaults={'color': 'blue'},
        )

    # A freshly initialized organization (SuperOrganization) gets the default
    # columns ...
    _create_default_columns(organization.id)

    # ... and the default rules, which are created as a side effect of
    # retrieving its data quality check.
    DataQualityCheck.retrieve(organization.id)

    return organization, organization_user, user_added
def test_ensure_default_rules(self):
    # The first retrieve should hand back a check holding exactly the default
    # rules and no results yet.
    dq = DataQualityCheck.retrieve(self.org)
    initial_pk = dq.pk
    self.assertEqual(dq.rules.count(), len(DEFAULT_RULES))
    self.assertEqual(dq.results, {})

    # check again to make sure that it does not append more rules to the same org
    dq = DataQualityCheck.retrieve(self.org.pk)
    # Compare the pk only after re-retrieving; comparing it against the same
    # object it was read from can never fail.
    self.assertEqual(initial_pk, dq.pk)
    self.assertEqual(dq.rules.count(), len(DEFAULT_RULES))
def get_queryset(self):
    """
    Return the rules of the organization named in the URL kwargs.

    With a 'pk' kwarg the queryset is narrowed to that single rule id;
    without any kwargs at all an empty queryset is returned.
    """
    # Handle the anonymous case (e.g. Swagger page load)
    if not self.kwargs:
        return Rule.objects.none()

    org_id = self.kwargs.get('nested_organization_id')
    rule_id = self.kwargs.get('pk')

    org_rules = DataQualityCheck.retrieve(org_id).rules
    if rule_id is not None:
        return org_rules.filter(id=rule_id)
    return org_rules.all()
def test_multiple_data_quality_check_objects(self):
    # Even when extra checks are created by hand for the org, retrieve()
    # should keep returning the default one.
    default_name = 'Default Data Quality Check'
    self.assertEqual(DataQualityCheck.retrieve(self.org).name, default_name)

    manual_names = (
        'test manual creation',
        'test manual creation 2',
        'test manual creation 3',
    )
    for manual_name in manual_names:
        DataQualityCheck.objects.create(organization=self.org, name=manual_name)

    # The retrieve call below will delete the multiple objects and return the original
    self.assertEqual(DataQualityCheck.retrieve(self.org).name, default_name)
def test_rule_with_label(self):
    # A rule can reference a status label, and removing the rules must leave
    # the label itself in place.
    dq = DataQualityCheck.retrieve(self.org)
    labelled_rules = dq.rules.filter(status_label__isnull=False)
    self.assertEqual(labelled_rules.count(), 0)

    label_lookup = {
        'name': 'test label on rule',
        'super_organization': self.org
    }
    status_label, _ = StatusLabel.objects.get_or_create(**label_lookup)
    self.assertEqual(StatusLabel.objects.filter(**label_lookup).count(), 1)

    dq.add_rule({
        'table_name': 'PropertyState',
        'field': 'conditioned_floor_area',
        'data_type': TYPE_NUMBER,
        'rule_type': RULE_TYPE_DEFAULT,
        'min': 0,
        'max': 7000000,
        'severity': SEVERITY_ERROR,
        'units': 'square feet',
        'status_label': status_label
    })

    labelled_rules = dq.rules.filter(status_label__isnull=False)
    self.assertEqual(labelled_rules.count(), 1)
    self.assertEqual(labelled_rules[0].status_label, status_label)

    # delete the rule but make sure that the label does not get deleted
    dq.remove_all_rules()
    self.assertEqual(StatusLabel.objects.filter(**label_lookup).count(), 1)
def test_reset_default_rules(self):
    # reset_default_rules() should restore edited default rules while leaving
    # custom (non-default) rules alone.
    dq = DataQualityCheck.retrieve(self.org)

    def gross_floor_area_min():
        return dq.rules.filter(field='gross_floor_area').first().min

    dq.add_rule({
        'table_name': 'PropertyState',
        'field': 'test_floor_area',
        'data_type': TYPE_NUMBER,
        'rule_type': RULE_TYPE_DEFAULT,
        'min': 0,
        'max': 7000000,
        'severity': SEVERITY_ERROR,
        'units': 'square feet'
    })
    self.assertEqual(dq.rules.count(), len(DEFAULT_RULES) + 1)

    # change one of the default rules
    edited_rule = dq.rules.filter(field='gross_floor_area').first()
    edited_rule.min = -10000
    edited_rule.save()
    self.assertEqual(gross_floor_area_min(), -10000)

    dq.reset_default_rules()
    self.assertEqual(gross_floor_area_min(), 100)

    # ensure non-default rule still exists
    self.assertEqual(dq.rules.filter(field='test_floor_area').count(), 1)
def test_check_property_state_example_data_with_labels(self):
    # Labels attached to rules should show up on the results those rules
    # produce.
    dq = DataQualityCheck.retrieve(self.org.id)

    # Create labels and apply them to the rules being triggered later
    site_eui_rule = dq.rules.get(table_name='PropertyState', field='site_eui', max='1000')
    site_eui_rule.status_label = StatusLabel.objects.create(
        name='Check Site EUI',
        super_organization=self.org,
    )
    site_eui_rule.save()

    year_built_rule = dq.rules.get(table_name='PropertyState', field='year_built')
    year_built_rule.status_label = StatusLabel.objects.create(
        name='Check Year Built',
        super_organization=self.org,
    )
    year_built_rule.save()

    # Create state and associate it to view
    state_fields = {
        'no_default_data': True,
        'custom_id_1': 'abcd',
        'address_line_1': '742 Evergreen Terrace',
        'pm_property_id': 'PMID',
        'site_eui': 525600,
        'year_built': 1699,
    }
    ps = self.property_state_factory.get_property_state(None, **state_fields)
    parent_property = self.property_factory.get_property()
    PropertyView.objects.create(property=parent_property, cycle=self.cycle, state=ps)

    dq.check_data(ps.__class__.__name__, [ps])

    applied_labels = [
        violation['label']
        for violation in dq.results[ps.id]['data_quality_results']
    ]
    self.assertCountEqual(['Check Site EUI', 'Check Year Built'], applied_labels)
def test_rule_with_label_set_to_null(self):
    # Deleting a status label that a rule points at should leave the rule in
    # place with its label cleared.
    dq = DataQualityCheck.retrieve(self.org)
    sl_data = {
        'name': 'test label on rule for null',
        'super_organization': self.org
    }
    status_label, _ = StatusLabel.objects.get_or_create(**sl_data)
    new_rule = {
        'name': 'Name not to be forgotten',
        'table_name': 'PropertyState',
        'field': 'conditioned_floor_area',
        'data_type': TYPE_NUMBER,
        'rule_type': RULE_TYPE_DEFAULT,
        'min': 0,
        'max': 7000000,
        'severity': SEVERITY_ERROR,
        'units': 'square feet',
        'status_label': status_label
    }
    dq.add_rule(new_rule)

    # Find data rule that has the status rule (from above)
    rules = dq.rules.filter(status_label__isnull=False)
    self.assertEqual(rules.count(), 1)
    self.assertEqual(rules[0].status_label, status_label)

    status_label.delete()

    # Verify the outcome the test is named for: the rule survives and its
    # label is now null.
    # NOTE(review): assumes Rule.status_label is declared with
    # on_delete=SET_NULL -- confirm against the model definition.
    rules = dq.rules.filter(name='Name not to be forgotten')
    self.assertEqual(rules.count(), 1)
    self.assertIsNone(rules[0].status_label)
def test_property_state_quality(self):
    # Run the org's data quality rules over the imported property states and
    # spot-check the results recorded for individual addresses.
    qs = PropertyState.objects.filter(
        import_file=self.import_file,
    ).iterator()
    d = DataQualityCheck.retrieve(self.org)
    d.check_data('PropertyState', qs)
    self.assertEqual(len(d.results), 7)

    result = d.retrieve_result_by_address('95373 E Peach Avenue')
    # assertEqual rather than assertTrue: assertTrue treats its second
    # argument as a failure message, so it never compared the address.
    self.assertEqual(result['address_line_1'], '95373 E Peach Avenue')
    res = [{
        "severity": "error",
        "value": "",
        "field": "pm_property_id",
        "table_name": "PropertyState",
        "message": "PM Property ID is null",
        "detailed_message": "PM Property ID is null",
        "formatted_field": "PM Property ID"
    }]
    self.assertEqual(res, result['data_quality_results'])

    result = d.retrieve_result_by_address('120243 E True Lane')
    res = [{
        "severity": "error",
        "value": "10000000000.0",
        "field": "gross_floor_area",
        "table_name": "PropertyState",
        "message": "Gross Floor Area out of range",
        "detailed_message": "Gross Floor Area [10000000000.0] > 7000000.0",
        "formatted_field": "Gross Floor Area"
    }, {
        "severity": "error",
        "value": "0",
        "field": "year_built",
        "table_name": "PropertyState",
        "message": "Year Built out of range",
        "detailed_message": "Year Built [0] < 1700",
        "formatted_field": "Year Built"
    }, {
        "severity": "error",
        "value": "",
        "field": "custom_id_1",
        "table_name": "PropertyState",
        "message": "Custom ID 1 (Property) is null",
        "detailed_message": "Custom ID 1 (Property) is null",
        "formatted_field": "Custom ID 1 (Property)"
    }, {
        "severity": "error",
        "value": "",
        "field": "pm_property_id",
        "table_name": "PropertyState",
        "message": "PM Property ID is null",
        "detailed_message": "PM Property ID is null",
        "formatted_field": "PM Property ID"
    }]
    # Order-insensitive comparison; assertCountEqual is the Python 3 name of
    # the old assertItemsEqual.
    self.assertCountEqual(res, result['data_quality_results'])

    result = d.retrieve_result_by_address('1234 Peach Tree Avenue')
    self.assertIsNone(result)
def reset_default_data_quality_rules(self, request):
    """
    Resets an organization's data quality rules
    ---
    parameters:
        - name: organization_id
          description: Organization ID
          type: integer
          required: true
          paramType: query
    type:
        status:
            type: string
            description: success or error
            required: true
        in_range_checking:
            type: array[string]
            required: true
            description: An array of in-range error rules
        missing_matching_field:
            type: array[string]
            required: true
            description: An array of fields to verify existence
        missing_values:
            type: array[string]
            required: true
            description: An array of fields to ignore missing values
    """
    # Look the organization up first so an unknown id fails here, before any
    # data quality check is retrieved for it.
    requested_org_id = request.query_params['organization_id']
    organization = Organization.objects.get(pk=requested_org_id)

    DataQualityCheck.retrieve(organization.id).reset_default_rules()

    # Respond with the same payload as the regular rules listing.
    return self.data_quality_rules(request)
def test_add_new_rule_exception(self):
    # add_rule() should reject rule data containing unknown keys with a
    # TypeError that names the offending key.
    dq = DataQualityCheck.retrieve(self.org)

    new_rule = {'wrong': 'data'}
    # assertRaisesRegex is the supported name; the assertRaisesRegexp alias
    # is deprecated and no longer exists in Python 3.12.
    with self.assertRaisesRegex(
        TypeError,
        "Rule data is not defined correctly: 'wrong' is an invalid keyword argument for this function"
    ):
        dq.add_rule(new_rule)
def create(self, validated_data):
    """
    Create a Rule bound to the data quality check of the organization in the URL.

    The organization id is read from the request's parser context kwargs
    ('nested_organization_id') and written into validated_data as
    'data_quality_check_id' before the Rule is created.
    """
    # For now, use an Org ID to find the DQ Check ID to apply (later, use the DQ Check ID directly)
    url_kwargs = self.context['request'].parser_context['kwargs']
    dq_check = DataQualityCheck.retrieve(url_kwargs['nested_organization_id'])
    validated_data['data_quality_check_id'] = dq_check.id

    return Rule.objects.create(**validated_data)
def test_update_rule_include_empty_text_match_validation(self):
    # An include/exclude rule must not end up with an empty text_match,
    # whether the update clears text_match or switches the condition.

    # Start with 1 Rule
    dq = DataQualityCheck.retrieve(self.org.id)
    dq.remove_all_rules()
    base_rule_info = {
        'field': 'address_line_1',
        'table_name': 'PropertyState',
        'enabled': True,
        'data_type': Rule.TYPE_STRING,
        'rule_type': Rule.RULE_TYPE_DEFAULT,
        'condition': Rule.RULE_INCLUDE,
        'required': False,
        'not_null': False,
        'min': None,
        'max': None,
        'text_match': 'Test Rule 1',
        'severity': Rule.SEVERITY_ERROR,
        'units': "",
    }
    dq.add_rule(base_rule_info)
    rule = dq.rules.get()

    # Send invalid update request
    put_data = deepcopy(base_rule_info)
    put_data['text_match'] = None
    url = reverse('api:v3:data_quality_check-rules-detail', kwargs={
        'nested_organization_id': self.org.id,
        'pk': rule.id
    })
    res = self.client.put(url, content_type='application/json', data=json.dumps(put_data))

    self.assertEqual(res.status_code, 400)
    # assertIn reports both the expected text and the actual message on
    # failure, unlike assertTrue(x in y).
    self.assertIn(
        'Rule must not include or exclude an empty string. ',
        json.loads(res.content)['message'])

    # Remove text_match and make condition NOT_NULL, then try making condition EXCLUDE
    rule.text_match = None
    rule.condition = Rule.RULE_NOT_NULL
    rule.save()

    put_data_2 = deepcopy(base_rule_info)
    del put_data_2['text_match']  # don't update text_match
    put_data_2['condition'] = Rule.RULE_EXCLUDE
    url = reverse('api:v3:data_quality_check-rules-detail', kwargs={
        'nested_organization_id': self.org.id,
        'pk': dq.rules.get().id
    })
    res = self.client.put(url, content_type='application/json', data=json.dumps(put_data_2))

    self.assertEqual(res.status_code, 400)
    self.assertIn(
        'Rule must not include or exclude an empty string. ',
        json.loads(res.content)['message'])
def reset(self, request, nested_organization_id=None):
    """
    Resets an organization's data quality rules.

    Removes every rule from the organization's data quality check and then
    responds with the regular rules listing for that organization.
    """
    # TODO: Refactor to get all the rules for a DataQualityCheck object directly.
    # At that point, nested_organization_id should be changed to data_quality_check_id
    org_check = DataQualityCheck.retrieve(nested_organization_id)
    org_check.remove_all_rules()

    # NOTE(review): presumably the listing repopulates the default rules via
    # DataQualityCheck.retrieve once the check is empty -- confirm there.
    return self.list(request, nested_organization_id)
def test_tax_lot_state_quality(self):
    # Run the org's data quality rules over the imported tax lot states and
    # check how many records ended up with results.
    imported_tax_lots = TaxLotState.objects.filter(import_file=self.import_file).iterator()

    checker = DataQualityCheck.retrieve(self.org)
    checker.check_data('TaxLotState', imported_tax_lots)

    self.assertEqual(len(checker.results), 4)
def test_remove_all_rules(self):
    # remove_all_rules() should leave no rules behind, both through the
    # relation and when querying Rule directly.
    dq = DataQualityCheck.retrieve(self.org)

    def stored_rule_count():
        return Rule.objects.filter(data_quality_check_id=dq.pk).count()

    self.assertEqual(stored_rule_count(), len(DEFAULT_RULES))

    dq.remove_all_rules()
    self.assertEqual(dq.rules.count(), 0)

    # ensure that the database has no rules for this dq associated with it
    self.assertEqual(stored_rule_count(), 0)
def results_csv(self, request):
    """
    Download a CSV of the results from a data quality run, looked up by the
    'run_id' query parameter: either the ID that was given when the data
    quality task was created or the ID of the import file whose records were
    checked.

    Responds with HTTP 400 when 'run_id' is missing. When nothing is cached
    for the run, the CSV contains only an error notice.

    Note that this is not related to objects in the database, since the
    results are stored in redis!
    """
    run_id = request.query_params.get('run_id')
    if run_id is None:
        error_body = {
            'status': 'error',
            'message': 'must include Import file ID or cache key as run_id'
        }
        return JsonResponse(error_body, status=status.HTTP_400_BAD_REQUEST)

    cached_rows = get_cache_raw(DataQualityCheck.cache_key(run_id))

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="Data Quality Check Results.csv"'
    writer = csv.writer(response)

    if cached_rows is None:
        writer.writerows([['Error'], ['data quality results not found']])
        return response

    header = [
        'Table', 'Address Line 1', 'PM Property ID', 'Tax Lot ID',
        'Custom ID', 'Field', 'Applied Label', 'Condition',
        'Error Message', 'Severity'
    ]
    writer.writerow(header)

    for row in cached_rows:
        violations = row['data_quality_results']
        for violation in violations:
            writer.writerow([
                # The table name is taken from the row's first result.
                violations[0]['table_name'],
                row['address_line_1'],
                row.get('pm_property_id'),
                row.get('jurisdiction_tax_lot_id'),
                row['custom_id_1'],
                violation['formatted_field'],
                violation.get('label'),
                violation['condition'],
                # the detailed_message field can have units which has superscripts/subscripts, so unidecode it!
                unidecode(violation['detailed_message']),
                violation['severity']
            ])

    return response
def results(self, request):
    """
    Return the results of a data quality run, looked up by the 'run_id'
    query parameter: either the ID that was given when the data quality task
    was created or the ID of the import file whose records were checked.

    Responds with HTTP 400 when 'run_id' is missing, matching results_csv.
    Otherwise responds with {'data': <cached results>}.

    Note that this is not related to objects in the database, since the
    results are stored in redis!
    """
    data_quality_id = request.query_params.get('run_id')
    if data_quality_id is None:
        # Indexing query_params directly would raise and surface as a server
        # error; a missing parameter is a client error.
        return JsonResponse(
            {
                'status': 'error',
                'message': 'must include Import file ID or cache key as run_id'
            },
            status=status.HTTP_400_BAD_REQUEST)

    data_quality_results = get_cache_raw(
        DataQualityCheck.cache_key(data_quality_id))
    return JsonResponse({'data': data_quality_results})
def results(self, request):
    """
    Return the result of a data quality run, looked up by the
    'data_quality_id' query parameter (the ID that was given when the data
    quality task was created).

    Note that this is not related to the object in the database, since the
    results are stored in redis!
    """
    params = request.query_params

    # The organization is only looked up, not used: the call raises when the
    # 'organization_id' parameter does not match an Organization.
    Organization.objects.get(pk=params['organization_id'])

    cache_key = DataQualityCheck.cache_key(params['data_quality_id'])
    return JsonResponse({'data': get_cache_raw(cache_key)})
def test_filter_rules(self):
    # Disabling a single rule should drop the enabled-rule count by one.
    dq = DataQualityCheck.retrieve(self.org)
    enabled_before = dq.rules.filter(enabled=True).count()

    # disable one of the rules
    first_rule = dq.rules.first()
    first_rule.enabled = False
    first_rule.save()

    enabled_after = dq.rules.filter(enabled=True).count()
    self.assertEqual(enabled_after, enabled_before - 1)
def test_add_custom_rule_exception(self):
    # add_rule() should raise with a descriptive message when the rule data
    # contains a key that is not a Rule field.
    dq = DataQualityCheck.retrieve(self.org.id)
    dq.remove_all_rules()

    bad_rule = {'table_name_does_not_exist': 'PropertyState'}

    with self.assertRaises(Exception) as raised:
        dq.add_rule(bad_rule)

    # Checked after the with block: statements following the raising call
    # inside the block would never run.
    expected_message = (
        "Rule data is not defined correctly: 'table_name_does_not_exist' is an invalid keyword argument for this function"
    )
    self.assertEqual(str(raised.exception), expected_message)
def csv(self, request, pk):
    """
    Download a csv of the data quality checks by the pk which is the cache_key
    ---
    parameter_strategy: replace
    parameters:
        - name: pk
          description: Import file ID or cache key
          required: true
          paramType: path
    """
    cached_rows = get_cache_raw(DataQualityCheck.cache_key(pk))

    response = HttpResponse(content_type='text/csv')
    response['Content-Disposition'] = 'attachment; filename="Data Quality Check Results.csv"'
    writer = csv.writer(response)

    # Nothing cached under this key: the CSV carries only an error notice.
    if cached_rows is None:
        writer.writerows([['Error'], ['data quality results not found']])
        return response

    header = [
        'Table', 'Address Line 1', 'PM Property ID', 'Tax Lot ID',
        'Custom ID', 'Field', 'Applied Label', 'Condition',
        'Error Message', 'Severity'
    ]
    writer.writerow(header)

    for row in cached_rows:
        violations = row['data_quality_results']
        for violation in violations:
            writer.writerow([
                # The table name is taken from the row's first result.
                violations[0]['table_name'],
                row['address_line_1'],
                row.get('pm_property_id'),
                row.get('jurisdiction_tax_lot_id'),
                row['custom_id_1'],
                violation['formatted_field'],
                violation.get('label'),
                violation['condition'],
                # the detailed_message field can have units which has superscripts/subscripts, so unidecode it!
                unidecode(violation['detailed_message']),
                violation['severity']
            ])

    return response
def test_add_custom_rule(self):
    # A custom rule added to an emptied check should be stored with exactly
    # the values it was given.
    dq = DataQualityCheck.retrieve(self.org.id)
    dq.remove_all_rules()

    ex_rule = {
        'table_name': 'PropertyState',
        'field': 'some_floor_area',
        'data_type': Rule.TYPE_AREA,
        'rule_type': Rule.RULE_TYPE_DEFAULT,
        'min': 8760,
        'max': 525600,
        'severity': Rule.SEVERITY_ERROR,
        'units': 'm**2',
    }

    dq.add_rule(ex_rule)
    self.assertEqual(dq.rules.count(), 1)

    # Subset check without the deprecated assertDictContainsSubset: overlaying
    # the expected values on the stored dict must change nothing.
    stored_rule = model_to_dict(dq.rules.first())
    self.assertEqual(stored_rule, {**stored_rule, **ex_rule})
def test_add_new_rule_and_reset(self):
    # Adding a rule grows the rule set by one; reset_all_rules() brings it
    # back to the default count.
    dq = DataQualityCheck.retrieve(self.org)
    default_count = len(DEFAULT_RULES)

    dq.add_rule({
        'table_name': 'PropertyState',
        'field': 'conditioned_floor_area',
        'data_type': TYPE_NUMBER,
        'rule_type': RULE_TYPE_DEFAULT,
        'min': 0,
        'max': 7000000,
        'severity': SEVERITY_ERROR,
        'units': 'square feet'
    })
    self.assertEqual(dq.rules.count(), default_count + 1)

    dq.reset_all_rules()
    self.assertEqual(dq.rules.count(), default_count)
def test_default_create(self):
    # Retrieving the check for an org should provide the default rule set;
    # one of the default rules is spot-checked field by field.
    dq = DataQualityCheck.retrieve(self.org.id)
    self.assertEqual(dq.rules.count(), 22)

    # Example rule to check
    ex_rule = {
        'table_name': 'PropertyState',
        'field': 'conditioned_floor_area',
        'data_type': Rule.TYPE_AREA,
        'rule_type': Rule.RULE_TYPE_DEFAULT,
        'min': 0,
        'max': 7000000,
        'severity': Rule.SEVERITY_ERROR,
        'units': 'ft**2',
    }

    rule = Rule.objects.filter(
        table_name='PropertyState',
        field='conditioned_floor_area',
        severity=Rule.SEVERITY_ERROR,
    )
    # Subset check without the deprecated assertDictContainsSubset: overlaying
    # the expected values on the stored dict must change nothing.
    stored_rule = model_to_dict(rule.first())
    self.assertEqual(stored_rule, {**stored_rule, **ex_rule})
def test_update_rule_status_label_validation(self):
    # Updating a rule with a status label that belongs to a different
    # organization must be rejected with a 400.

    # Start with 1 Rule
    dq = DataQualityCheck.retrieve(self.org.id)
    dq.remove_all_rules()
    base_rule_info = {
        'field': 'address_line_1',
        'table_name': 'PropertyState',
        'enabled': True,
        'data_type': Rule.TYPE_STRING,
        'rule_type': Rule.RULE_TYPE_DEFAULT,
        'condition': Rule.RULE_INCLUDE,
        'required': False,
        'not_null': False,
        'min': None,
        'max': None,
        'text_match': 'Test Rule 1',
        'severity': Rule.SEVERITY_ERROR,
        'units': "",
    }
    dq.add_rule(base_rule_info)
    rule = dq.rules.get()

    # Send invalid update request that includes a label id from another org
    new_org, _, _ = create_organization(self.user, "test-organization-a")
    wrong_org_label_id = new_org.labels.first().id
    put_data = deepcopy(base_rule_info)
    put_data['status_label'] = wrong_org_label_id
    url = reverse('api:v3:data_quality_check-rules-detail', kwargs={
        'nested_organization_id': self.org.id,
        'pk': rule.id
    })
    res = self.client.put(url, content_type='application/json', data=json.dumps(put_data))

    self.assertEqual(res.status_code, 400)
    # assertIn reports both the expected text and the actual payload on
    # failure, unlike assertTrue(x in y).
    self.assertIn(
        f'Label with ID {wrong_org_label_id} not found in organization, {self.org.name}.',
        json.loads(res.content)['status_label'])
def test_reset_rules(self):
    # Calling the reset endpoint on a check holding one custom rule should
    # return the full default rule set, split across the two tables.

    # Start with 1 Rule
    dq = DataQualityCheck.retrieve(self.org.id)
    dq.remove_all_rules()
    dq.add_rule({
        'field': 'address_line_1',
        'table_name': 'PropertyState',
        'enabled': True,
        'data_type': Rule.TYPE_STRING,
        'rule_type': Rule.RULE_TYPE_DEFAULT,
        'condition': Rule.RULE_INCLUDE,
        'required': False,
        'not_null': False,
        'min': None,
        'max': None,
        'text_match': 'Test Rule 1',
        'severity': Rule.SEVERITY_ERROR,
        'units': "",
    })

    reset_url = reverse(
        'api:v3:data_quality_check-rules-reset',
        kwargs={'nested_organization_id': self.org.id},
    )
    returned_rules = json.loads(self.client.put(reset_url).content)

    self.assertEqual(len(returned_rules), 22)

    table_names = [r['table_name'] for r in returned_rules]
    taxlot_count = sum(1 for name in table_names if name == 'TaxLotState')
    property_count = sum(1 for name in table_names if name == 'PropertyState')

    self.assertEqual(taxlot_count, 2)
    self.assertEqual(property_count, 20)
def test_text_match(self):
    # With a single text_match rule in place, a property state whose address
    # contains the matched text should produce no results.
    dq = DataQualityCheck.retrieve(self.org.id)
    dq.remove_all_rules()
    dq.add_rule({
        'table_name': 'PropertyState',
        'field': 'address_line_1',
        'data_type': Rule.TYPE_STRING,
        'rule_type': Rule.RULE_TYPE_DEFAULT,
        'severity': Rule.SEVERITY_ERROR,
        'not_null': True,
        'text_match': 742,
    })

    state_fields = {
        'no_default_data': True,
        'custom_id_1': 'abcd',
        'address_line_1': '742 Evergreen Terrace',
        'pm_property_id': 'PMID',
        'site_eui': 525600,
    }
    ps = self.property_state_factory.get_property_state(None, **state_fields)

    dq.check_data(ps.__class__.__name__, [ps])
    self.assertEqual(dq.results, {})
def test_check_property_state_example_data(self):
    # An out-of-range site_eui should be reported by the default rules,
    # alongside the identifying fields of the checked state.
    dq = DataQualityCheck.retrieve(self.org.id)
    state_fields = {
        'no_default_data': True,
        'custom_id_1': 'abcd',
        'address_line_1': '742 Evergreen Terrace',
        'pm_property_id': 'PMID',
        'site_eui': 525600,
    }
    ps = self.property_state_factory.get_property_state(None, **state_fields)

    dq.check_data(ps.__class__.__name__, [ps])

    # Example of the results structure being inspected:
    # {
    #   11: {
    #           'id': 11,
    #           'custom_id_1': 'abcd',
    #           'pm_property_id': 'PMID',
    #           'address_line_1': '742 Evergreen Terrace',
    #           'data_quality_results': [
    #               {
    #                  'severity': 'error', 'value': '525600', 'field': 'site_eui', 'table_name': 'PropertyState', 'message': 'Site EUI out of range', 'detailed_message': 'Site EUI [525600] > 1000', 'formatted_field': 'Site EUI'
    #               }
    #           ]
    #       }
    # }
    site_eui_flagged = False
    for row in dq.results.values():
        self.assertEqual(row['custom_id_1'], 'abcd')
        self.assertEqual(row['pm_property_id'], 'PMID')
        self.assertEqual(row['address_line_1'], '742 Evergreen Terrace')
        for violation in row['data_quality_results']:
            if violation['message'] != 'Site EUI out of range':
                continue
            site_eui_flagged = True
            self.assertEqual(violation['detailed_message'], 'Site EUI [525600] > 1000')

    self.assertEqual(site_eui_flagged, True)