def organism_part_sex_check(sample: Dict, animal: Dict, existing_results: VRR) -> VRR:
    """
    Context validation to check that the organism part of a sample is
    compatible with the sex of the animal it derives from: the organism part
    with ontology id UBERON_0001968 (semen) is rejected for a female animal
    and only flagged with a warning for an animal of unknown sex

    :param sample: the sample record
    :param animal: the derived from animal record
    :param existing_results: the existing validation result
    :return: the updated validation result
    """
    animal_sex: str = animal['attributes']['Sex'][0]['value']
    part_url = sample['attributes']['Organism part'][0]['terms'][0]['url']
    part_term = misc.extract_ontology_id_from_iri(part_url)

    # only the semen term is subject to this check
    if part_term != 'UBERON_0001968':
        return existing_results

    lowered_sex = animal_sex.lower()
    if lowered_sex == "female":
        level = VRConstant.ERROR
        message = "Organism part (S***n) could not be taken from a female animal"
    elif 'unknown sex' in lowered_sex:
        # the third sex option is 'record of unknown sex'
        level = VRConstant.WARNING
        message = ("Organism part (S***n) is expected to be taken from a male animal, "
                   "please check the sex value (record of unknown sex) is correct")
    else:
        return existing_results

    existing_results.add_validation_result_column(
        VRC(level, message, existing_results.record_id,
            "organism part", VRConstant.CONTEXT))
    return existing_results
def validate(self, record: Dict, id_field: str = 'Data source ID') -> VRR:
    """
    Validate one record against every section of the ruleset whose condition
    the record meets, then warn about each attribute no applied section knows

    :param record: the record data
    :param id_field: the name of the id field, in IMAGE ruleset it is Data source ID
    :return: the validation result
    """
    logger.debug(f"got record: {record}, id_field: {id_field}")
    attributes = record['attributes']
    record_id = attributes[id_field][0]['value']
    record_result = VRR(record_id)

    # start from all columns and strike out the ones covered by the ruleset
    unmapped = attributes.copy()
    del unmapped[id_field]

    for section_name in self.get_all_section_names():
        logger.debug(f"Processing section_name: {section_name}")
        section_rule = self.get_section_by_name(section_name)

        if not section_rule.meet_condition(record):
            logger.debug("section_rule %s doesn't meet_condition" % section_name)
            continue

        logger.debug("Applying " + section_name + " ruleset to record " + record_id)
        section_results = section_rule.validate(attributes, record_id, id_field)
        for column_result in section_results:
            record_result.add_validation_result_column(column_result)
        for field_name in section_rule.get_rule_names():
            unmapped.pop(field_name, None)

    # the unmapped column check has to wait until every section rule has been
    # applied, so it cannot live inside the section rule validation
    if not unmapped:
        logger.debug("No unmapped columns left")
        return record_result

    logger.debug("found those unmapped keys: %s" % (unmapped.keys()))
    for key in unmapped:
        warning = VRC(VRConstants.WARNING,
                      f"Column {key} could not be found in ruleset",
                      record_id, key)
        record_result.add_validation_result_column(warning)
    return record_result
def species_check(record: Dict, existing_results: VRR) -> VRR:
    """
    Context validation to check that the species specified in the USI
    structure (taxonId) matches the ontology term used in the species field

    The url of the species term has to end with the complete taxon id. A bare
    suffix test is not enough: taxonId 913 would otherwise be accepted for a
    url ending in 9913.

    :param record: the record data
    :param existing_results: the existing validation result
    :return: the updated validation result
    """
    taxon_id = record['taxonId']
    url = record['attributes'][SPECIES][0]['terms'][0]['url']

    taxon_text = str(taxon_id)
    matched = url.endswith(taxon_text)
    if matched and taxon_text[:1].isdigit():
        # a digit right in front of the matched suffix means only the tail of
        # a longer number has been matched, which is a different taxon
        preceding = url[:len(url) - len(taxon_text)][-1:]
        matched = not preceding.isdigit()

    if not matched:
        existing_results.add_validation_result_column(
            VRC(VRConstant.ERROR,
                f"taxonId {taxon_id} does not match ontology term used in species {url}",
                existing_results.record_id, "taxonomy", VRConstant.CONTEXT))
    return existing_results
def test_sample_relationship_issue(self):
    """Testing an error with related alias. Not sure if it can happen or not"""
    sample = self.sample_record

    # replace the relationships with one whose alias is expected to match no
    # record
    sample["sampleRelationships"] = [
        {"alias": "IMAGEA999999999", "relationshipNature": "derived from"},
    ]

    # a fresh ValidationResultRecord to collect the outcome
    blank_result = ValidationResultRecord(record_id=sample['title'])

    related, outcome = self.metadata.check_relationship(sample, blank_result)

    # nothing could be related and the result reports an error
    self.assertEqual(related, [])
    self.assertEqual(outcome.get_overall_status(), 'Error')
    first_message = outcome.get_messages()[0]
    self.assertIn("Could not locate the referenced record", first_message)
def check_value_equal(source: Dict, target: Dict, existing_results: VRR, field: str) -> VRR:
    """
    Context validation to check that two records hold the same value in the
    given field, adding an error to the validation result when they differ

    In the error message the records are called child/parent when the material
    of the source record is organism, sample/related animal otherwise

    :param source: the record being validated
    :param target: the record the source is compared to
    :param existing_results: the existing validation result
    :param field: the name of the attribute to compare
    :return: the updated validation result
    """
    target_value = target['attributes'][field][0]['value']
    source_value = source['attributes'][field][0]['value']

    is_organism = source['attributes']['Material'][0]['value'] == 'organism'
    source_label, target_label = (
        ('child', 'parent') if is_organism else ('sample', 'related animal'))

    if target_value == source_value:
        return existing_results

    message = (f"The {field} of {source_label} ({source_value}) does not "
               f"match to the {field} of {target_label} ({target_value})")
    existing_results.add_validation_result_column(
        VRC(VRConstant.ERROR, message, existing_results.record_id, field,
            VRConstant.CONTEXT))
    return existing_results
def context_validation(record: Dict, existing_results: VRR, related: List = None) -> VRR:
    """
    Run the context based checks, i.e. checks where the value in one field
    affects the allowed values in another field, or which involve more than
    one record

    The coordinate and species checks always run. The relationship checks run
    only when related records are given: an organism may have at most two
    parents, any other material must derive from exactly one animal.

    :param record: the record data
    :param existing_results: the existing validation result
    :param related: list of the related records either parents or related animal,
        could be empty list
    :return: updated validation result
    """
    existing_results = coordinate_check(record['attributes'], existing_results)
    existing_results = species_check(record, existing_results)
    record_id = existing_results.record_id

    # no relationships, nothing more to check
    if not related:
        return existing_results

    related_count = len(related)
    material = record['attributes']['Material'][0]['value']

    if material == "organism":
        if related_count > 2:
            existing_results.add_validation_result_column(
                VRC(VRConstant.ERROR,
                    "Having more than 2 parents defined in sampleRelationships",
                    existing_results.record_id, "sampleRelationships",
                    VRConstant.CONTEXT))
            return existing_results
        existing_results = child_of_check(record, related, existing_results)
        if related_count == 2:
            existing_results = parents_sex_check(related, existing_results)
        return existing_results

    if related_count == 1:
        return animal_sample_check(record, related[0], existing_results)

    existing_results.add_validation_result_column(
        VRC(VRConstant.ERROR, "Specimen can only derive from one animal",
            record_id, "sampleRelationships", VRConstant.CONTEXT))
    return existing_results
def species_breed_check(animal: Dict, existing_results: VRR) -> VRR:
    """
    Check whether the mapped breed (recommended) matches the species: the
    mapped breed has to descend either from the general breed term or from
    the general crossbreed term of the species

    When no mapped breed is given, a warning states that no check has been
    carried out on the supplied breed (mandatory)

    :param animal: the animal record to be validated
    :param existing_results: the existing validation result
    :return: the updated validation result
    """
    attrs = animal['attributes']
    species = attrs[SPECIES][0]['value']

    # root breed ontology term for the given species: the last url segment
    general_breed = use_ontology.get_general_breed_by_species(species)
    general_breed_term = general_breed['ontologyTerms'].rsplit("/", 1)[1]

    if 'Mapped breed' not in attrs:
        supplied_breed = attrs['Supplied breed'][0]['value']
        existing_results.add_validation_result_column(
            VRC(VRConstant.WARNING,
                f"No check has been carried out on whether "
                f"{supplied_breed} is a {species} breed as no mapped breed provided",
                existing_results.record_id, "Supplied breed",
                VRConstant.CONTEXT))
        return existing_results

    mapped_breed = attrs['Mapped breed'][0]['terms'][0]['url']
    library = static_parameters.ontology_library
    if library.has_parent(mapped_breed, general_breed_term):
        return existing_results

    # not a pure breed of the species, try the crossbreed root term
    general_crossbreed = use_ontology.get_general_breed_by_species(
        species, cross=True)
    general_crossbreed_term = general_crossbreed['ontologyTerms'].rsplit(
        "/", 1)[1]
    if not library.has_parent(mapped_breed, general_crossbreed_term):
        existing_results.add_validation_result_column(
            VRC(VRConstant.ERROR,
                f"The mapped breed {mapped_breed} does not match the given species {species}",
                existing_results.record_id, "Mapped breed",
                VRConstant.CONTEXT))
    return existing_results
def load_ruleset(self, ruleset_file: str) -> VRR:
    """
    Load the ruleset from the JSON file and check its integrity

    The ruleset ready flag is reset first and only set again when the ruleset
    has been read and its integrity check reports Pass

    :param ruleset_file: the JSON file containing the ruleset
    :return: the general validation result, which holds the error when reading
        the ruleset raised a KeyError, or the result of the integrity check
        when that check did not pass
    """
    self.ruleset_pass_flag = False
    general_errors = VRR("general")

    try:
        self.ruleset = validation.read_in_ruleset(ruleset_file)
    except KeyError as e:
        read_error = VRC(VRConstants.ERROR, str(e), general_errors.record_id,
                         "", VRConstants.GENERAL)
        general_errors.add_validation_result_column(read_error)
        return general_errors

    integrity: VRR = validation.check_ruleset(self.ruleset)
    if integrity.get_overall_status() == "Pass":
        logger.info("Ruleset loaded")
        self.ruleset_pass_flag = True
        return general_errors
    return integrity
def parents_sex_check(related: List[Dict], existing_results: VRR) -> VRR:
    """
    Context validation to check that the two annotated parents do not share
    the same sex

    When at least one parent is annotated with unknown sex the comparison is
    skipped and a Warning is raised instead

    :param related: the list of two parent animals
    :param existing_results: the existing validation result
    :return: the updated validation result
    """
    first_sex: str = related[0]['attributes']['Sex'][0]['value']
    second_sex: str = related[1]['attributes']['Sex'][0]['value']

    if "unknown sex" in first_sex.lower() or "unknown sex" in second_sex.lower():
        existing_results.add_validation_result_column(
            VRC(VRConstant.WARNING,
                "At least one parent has unknown value for sex, thus could not be checked",
                existing_results.record_id, "parents sex",
                VRConstant.CONTEXT))
    elif first_sex == second_sex:
        existing_results.add_validation_result_column(
            VRC(VRConstant.ERROR, "Two parents could not have same sex",
                existing_results.record_id, "parents sex",
                VRConstant.CONTEXT))
    return existing_results
def coordinate_check(record: Dict, existing_results: VRR) -> VRR:
    """
    Context validation to check whether the value in the place field matches
    the value in the accompanying accuracy field

    The place field is Birth location for organisms and Collection place for
    every other material. An error is added when the place is absent while the
    accuracy is not "missing geographic information", and when the place is
    given while the accuracy is "missing geographic information".

    :param record: the attributes of the record, i.e. the mapping from field
        name to its list of values
    :param existing_results: the existing validation result
    :return: the updated validation result
    :raises TypeError: when record is not a dict or existing_results is not a
        ValidationResultRecord
    """
    # isinstance rather than an exact type comparison, so that subclasses of
    # dict and of the validation result class are accepted as well
    if not isinstance(record, dict):
        raise TypeError("record needs to be a record represented as a Dict")
    if not isinstance(existing_results, VRR):
        raise TypeError(
            "The existing results parameter needs to be a ValidationResultRecord object"
        )

    material = record['Material'][0]['value']
    place_field_name = "Birth location" if material == "organism" else "Collection place"
    place_accuracy_field_name = place_field_name + " accuracy"

    # the accuracy value is needed whether or not the place is present
    accuracy = record[place_accuracy_field_name][0]['value']
    accuracy_is_missing = accuracy == "missing geographic information"

    msg = None
    if place_field_name not in record:
        if not accuracy_is_missing:
            msg = f"No value provided for field {place_field_name} but value in field" \
                  f" {place_accuracy_field_name} is not missing geographic information"
    elif accuracy_is_missing:
        msg = f"Value {record[place_field_name][0]['value']} provided for field {place_field_name} " \
              f"but value in field {place_accuracy_field_name} is missing geographic information"

    if msg is not None:
        existing_results.add_validation_result_column(
            VRC(VRConstant.ERROR, msg, existing_results.record_id,
                place_field_name, VRConstant.CONTEXT))
    return existing_results
def load_data(self, data_file: str, section: str = '') -> VRR:
    """
    Load the data to be validated from a JSON file and do the preliminary
    validation (usi structure and duplicate ids)

    The data ready flag is reset first and only set again when every
    preliminary check has passed

    :param data_file: the JSON file contains the data
    :param section: optional, the name of the section which contains data
    :return: the general validation result holding any file or duplicate id
        error, or the result of the usi structure check when it did not pass
    """
    self.data_ready_flag = False
    general_errors = VRR("general")

    try:
        with open(data_file) as infile:
            self.data = json.load(infile)
    except FileNotFoundError:
        load_problem = f"Could not find the file {data_file}"
    except json.decoder.JSONDecodeError:
        load_problem = f"The provided file {data_file} is not a valid JSON file."
    else:
        load_problem = None

    if load_problem is not None:
        general_errors.add_validation_result_column(
            VRC(VRConstants.ERROR, load_problem, general_errors.record_id,
                "", VRConstants.GENERAL))
        return general_errors

    # narrow the data down to the requested section when it is present
    if len(section) > 0 and section in self.data:
        self.data = self.data[section]

    # check usi structure
    usi_check_result = validation.check_usi_structure(self.data)
    if usi_check_result.get_overall_status() != "Pass":
        return usi_check_result

    # check duplicate id, the id field is implicitly required to hold unique
    # values
    duplicate_msgs = validation.check_duplicates(self.data, self.id_field)
    if duplicate_msgs:
        for duplicate_msg in duplicate_msgs:
            general_errors.add_validation_result_column(
                VRC(VRConstants.ERROR, duplicate_msg,
                    general_errors.record_id, self.id_field,
                    VRConstants.RELATIONSHIP))
        return general_errors

    logger.info("All sample records have unique data source ids")
    self.data_ready_flag = True
    return general_errors
def check_biosample_id(self, mock_get, status_code):
    """Base method for checking biosample id"""
    # make the patched get return a response with the requested status code
    fake_response = Mock()
    fake_response.status_code = status_code
    mock_get.return_value = fake_response

    # check a biosample id against a fresh ValidationResultRecord
    validator = MetaDataValidation()
    outcome = validator.check_biosample_id_target(
        "FAKEA123456", "test", ValidationResultRecord(record_id="test"))

    # the ruleset mocks and the patched get must all have been used
    for mocked in (self.check_ruleset, self.read_in_ruleset, mock_get):
        self.assertTrue(mocked.called)

    return outcome
def test_validate_submission_errors(self, my_validate, my_check):
    """A submission with errors is a NEED_REVISION submission"""

    def single_column_result(record_id, status, message, column):
        """Build a ValidationResultRecord holding exactly one column"""
        built = ValidationResultRecord(record_id)
        built.add_validation_result_column(
            ValidationResultColumn(status, message, record_id, column))
        return built

    # check_usi_structure passes without messages
    usi_result = PickableMock()
    usi_result.get_overall_status.return_value = "Pass"
    usi_result.get_messages.return_value = []
    my_check.return_value = usi_result

    # check_with_ruleset answers: three animals first, then one sample
    responses = [
        single_column_result("animal_1", "warning", "warn message", "warn column"),
        single_column_result("animal_2", "pass", "a message", ""),
        single_column_result("animal_3", "pass", "a message", ""),
        single_column_result("sample_1", "error", "error message", "error column"),
    ]
    my_validate.side_effect = responses

    # call task, the validation task itself is expected to succeed
    res = self.my_task.run(submission_id=self.submission_id)
    self.assertEqual(res, "success")

    # the submission state changed to NEED_REVISION
    self.submission.refresh_from_db()
    self.assertEqual(self.submission.status, NEED_REVISION)
    self.assertIn("Error in metadata", self.submission.message)

    # animals are all ok, the sample needs revision
    self.check_model_status(self.animal_qs, responses, READY)
    sample_responses = responses[self.animal_qs.count():]
    self.check_model_status(self.sample_qs, sample_responses, NEED_REVISION)

    # my patched methods were called
    self.assertTrue(my_check.called)
    self.assertTrue(my_validate.called)

    # the ruleset mocks were used and no retry was requested
    self.assertTrue(self.read_in_ruleset.called)
    self.assertTrue(self.check_ruleset.called)
    self.assertFalse(self.validate_retry.called)

    expected_summary = {
        'animals': self.n_animals,
        'samples': self.n_samples,
        'animal_unkn': 0,
        'sample_unkn': 0,
        'animal_issues': 0,
        'sample_issues': 1
    }
    self.check_message(
        message='Need Revision',
        notification_message=('Validation got errors: Error in '
                              'metadata. Need revisions before submit'),
        validation_message=expected_summary,
        pk=1)
def test_validate_submission_wrong_json(self, my_validate, my_check):
    """Test an error in JSON format"""
    # check_usi_structure reports two errors for the record
    messages = [('Wrong JSON structure: no title field for record with '
                 'alias as animal_1'),
                ('Wrong JSON structure: the values for attribute Person '
                 'role needs to be in an array for record animal_1')]

    usi_result = ValidationResultRecord("animal_1")
    for message in messages:
        usi_result.add_validation_result_column(
            ValidationResultColumn("error", message, "animal_1", ""))

    # the same result four times, also used to call check_model_status
    responses = [usi_result] * 4
    my_check.side_effect = responses

    # check_with_ruleset would pass
    rule_result = Mock()
    rule_result.get_overall_status.return_value = "Pass"
    my_validate.return_value = rule_result

    # call task, the validation task itself is expected to succeed
    res = self.my_task.run(submission_id=self.submission_id)
    self.assertEqual(res, "success")

    # the submission state changed to NEED_REVISION
    self.submission.refresh_from_db()
    self.assertEqual(self.submission.status, NEED_REVISION)
    self.assertIn("Validation got errors", self.submission.message)

    # both animals and samples need revision
    self.check_model_status(self.animal_qs, responses, NEED_REVISION)
    sample_responses = responses[self.animal_qs.count():]
    self.check_model_status(self.sample_qs, sample_responses, NEED_REVISION)

    # if JSON is not valid, the ruleset validation is never reached
    self.assertTrue(my_check.called)
    self.assertFalse(my_validate.called)

    # the ruleset mocks were used and no retry was requested
    self.assertTrue(self.read_in_ruleset.called)
    self.assertTrue(self.check_ruleset.called)
    self.assertFalse(self.validate_retry.called)

    # all samples and animals have issues
    expected_summary = {
        'animals': self.n_animals,
        'samples': self.n_samples,
        'animal_unkn': 0,
        'sample_unkn': 0,
        'animal_issues': self.n_animals,
        'sample_issues': self.n_samples
    }
    self.check_message(
        'Need Revision',
        ('Validation got errors: Error in metadata. '
         'Need revisions before submit'),
        expected_summary,
        1)