def validate_fields(self) -> List:
    """
    Run the per-field validation checks on this file's metadata and
    return them as a list of CheckResult objects.

    The checksum-comparison check only runs when both the upload-time
    checksum and the metadata checksum were found; otherwise it is
    reported as not executed.
    """
    upload_ck = self.check_checksum_at_upload_present()
    meta_ck = self.check_checksum_in_meta_present()

    comparison = CheckResult(
        check_name=CHECK_NAMES.
        check_by_comparison_checksum_in_meta_with_checksum_at_upload)
    both_present = (upload_ck.result == RESULT.SUCCESS
                    and meta_ck.result == RESULT.SUCCESS)
    if not both_present:
        # Can't compare the two checksums unless both were found.
        comparison.executed = False
        comparison.result = None
    elif self.checksum_in_meta != self.checksum_at_upload:
        comparison.result = RESULT.FAILURE
        comparison.error_message = "The checksum in metadata = %s different than checksum at upload = %s" % \
            (self.checksum_in_meta, self.checksum_at_upload)
    else:
        comparison.result = RESULT.SUCCESS

    return [
        upload_ck,
        meta_ck,
        comparison,
        self.check_npg_qc_field(),
        self.check_target_field(),
    ]
def check_more_than_one_replicas(cls, replicas) -> CheckResult:
    """
    Warn when a file has fewer than two replicas.

    :param replicas: sequence of replica objects for the file
    :return: a CheckResult with WARNING severity; marked as a FAILURE
             when there are zero or one replicas.
    """
    outcome = CheckResult(
        check_name=CHECK_NAMES.check_more_than_one_replica,
        severity=SEVERITY.WARNING)
    replica_count = len(replicas)
    if replica_count > 1:
        # Enough replicas: leave the CheckResult with its defaults.
        return outcome
    outcome.executed = True
    outcome.result = RESULT.FAILURE
    outcome.error_message = "File has " + str(replica_count) + " replicas"
    return outcome
def checksum_comparison_check(self):
    """
    Compare the checksum recorded in the iRODS metadata against the
    checksum computed at upload time.

    If either checksum is missing the check is marked as not executed
    and the reasons are collected in the error_message list. When both
    are present but differ, the check fails.

    :return: a CheckResult describing the outcome.
    """
    result = CheckResult(
        check_name=CHECK_NAMES.
        check_by_comparison_checksum_in_meta_with_checksum_at_upload,
        error_message=[])
    runnable = True
    if not self.checksum_at_upload:
        runnable = False
        result.executed = False
        result.error_message.append("Missing ichecksum result.")
    if not self.checksum_in_meta:
        runnable = False
        result.executed = False
        result.error_message.append("Missing checksum from metadata")
    if runnable and self.checksum_at_upload != self.checksum_in_meta:
        result.result = RESULT.FAILURE
        # NOTE: error_message switches from a list to a plain string on
        # this path, mirroring the original behaviour.
        result.error_message = "The checksum in metadata = %s different than checksum at upload = %s" % \
            (self.checksum_in_meta, self.checksum_at_upload)
    return result
def check_reference(self, desired_ref_name: str) -> CheckResult:
    """
    Check that every reference in this file's metadata matches the
    desired reference (case-insensitive substring match).

    :param desired_ref_name: the reference name expected in the metadata
    :return: a CheckResult; not executed when either the metadata has no
             references or no desired reference was given, FAILURE when
             any metadata reference does not contain the desired name.
    """
    # Note: annotation fixed from List[CheckResult] — a single
    # CheckResult is (and was) returned.
    check_result = CheckResult(
        check_name=CHECK_NAMES.check_desired_reference)
    check_result.error_message = []
    if not self.get_references():
        check_result.result = None
        check_result.executed = False
        check_result.error_message.append(
            "Missing reference from the metadata")
    if not desired_ref_name:
        check_result.result = None
        check_result.executed = False
        check_result.error_message.append(
            "Missing desired reference parameter")
    if not check_result.error_message:
        for ref in self.get_references():
            # Case-insensitive substring match; a later mismatch
            # overwrites the message of an earlier one, so only the
            # last non-matching reference is reported.
            if ref.lower().find(desired_ref_name.lower()) == -1:
                check_result.result = RESULT.FAILURE
                # fixed typo in the message: "thant" -> "than"
                check_result.error_message = "The desired reference is: %s is different than the metadata reference: %s" % (
                    desired_ref_name, ref)
    return check_result
def check_studies_fetched_by_samples(self):
    """
    Cross-check the studies recorded in iRODS against the studies that
    Sequencescape reports for this file's samples.

    :return: a list containing one CheckResult (not executed when the
             metadata holds no samples).
    """
    check_results = []
    same_study_for_samples_check = CheckResult(
        check_name=CHECK_NAMES.
        check_studies_in_irods_with_studies_in_seqscape_fetched_by_samples)
    #check_for_samples_in_more_studies = CheckResult(check_name=CHECK_NAMES.check_for_samples_in_more_studies, severity=SEVERITY.WARNING)
    if not self.get_entities_by_type('sample'):
        # No samples in the metadata -> nothing to cross-check against.
        same_study_for_samples_check.executed = False
        same_study_for_samples_check.result = None
        # check_for_samples_in_more_studies.executed = False
        # check_for_samples_in_more_studies.result = None
        # check_results.append(check_for_samples_in_more_studies)
        check_results.append(same_study_for_samples_check)
        return check_results
    # Studies Sequencescape associates with the file's samples vs the
    # studies recorded directly in iRODS metadata.
    studies_by_samples_set = set(
        self.get_all_entities_by_association_by_type('sample', 'study'))
    studies_set = set(self.get_entities_by_type('study'))
    studies_set_names = [study.name for study in studies_set]
    studies_by_samples_set_names = [
        study.name for study in studies_by_samples_set
    ]
    sample_set_ids = [(sample.name, sample.accession_number)
                      for sample in self.get_entities_by_type('sample')]
    if not studies_set.issubset(studies_by_samples_set):
        # Some iRODS study is not among the studies Seqscape reports
        # for these samples.
        error_msg = "For the %s given seqscape samples, the studies in iRODS: %s and the studies in Seqscape DISAGREE: %s" % (
            str(len(sample_set_ids)), studies_set_names,
            studies_by_samples_set_names)
        same_study_for_samples_check.result = RESULT.FAILURE
        same_study_for_samples_check.error_message = error_msg
    else:
        # NOTE(review): in this branch studies_set IS a subset of
        # studies_by_samples_set, so the difference below is always
        # empty and this failure can never trigger — the operands may
        # have been meant the other way around; confirm intent.
        diff_wrong_studies_for_samples_in_irods = studies_set.difference(
            studies_by_samples_set)
        if diff_wrong_studies_for_samples_in_irods:
            error_msg = "Studies in Seqscape and in iRODS for %s samples don't agree. Studies in iRODS and not in Seqscape: %s" % (
                str(len(sample_set_ids)),
                diff_wrong_studies_for_samples_in_irods)
            same_study_for_samples_check.result = RESULT.FAILURE
            same_study_for_samples_check.error_message = error_msg
    check_results.append(same_study_for_samples_check)
    # diff_sam_belongs2more_studies = studies_by_samples_set.difference(studies_set)
    # if diff_sam_belongs2more_studies:
    #     error_msg = "Some samples belong to more than one study. For samples: %s we had these studies as metadata: %s and we found in Seqscape these studies: %s" % (
    #         sample_set_ids,
    #         studies_set_names,
    #         studies_by_samples_set_names)
    #     check_for_samples_in_more_studies.result = RESULT.FAILURE
    #     check_for_samples_in_more_studies.error_message = error_msg
    #     check_results.append(check_for_samples_in_more_studies)
    return check_results
def check_samples_fetched_by_studies(self):
    """
    Cross-check: every sample recorded in iRODS for this file should be
    among the samples Sequencescape reports for the file's study(s).

    :return: a CheckResult (not executed when the metadata holds no
             study; FAILURE when iRODS lists samples Sequencescape does
             not associate with the study).
    """
    outcome = CheckResult(
        check_name=CHECK_NAMES.
        check_samples_in_irods_same_as_samples_fetched_by_study_from_seqscape
    )
    if not self.get_entities_by_type('study'):
        # Without a study there is nothing to fetch samples by.
        outcome.executed = False
        outcome.result = None
        return outcome
    seqscape_samples = set(
        self.get_all_entities_by_association_by_type('study', 'sample'))
    irods_samples = set(self.get_entities_by_type('sample'))
    # Samples present in iRODS but not returned by Seqscape for the study.
    orphan_samples = irods_samples - seqscape_samples
    if orphan_samples:
        outcome.result = RESULT.FAILURE
        outcome.error_message = "Some samples don't appear under study(s): %s in Sequencescape, " \
                                "but they appear under this study in iRODS. Number of samples: %s, " \
                                "and ids: %s" % ([study.name for study in self.get_entities_by_type('study')],
                                                 str(len(orphan_samples)),
                                                 [(s.name, s.accession_number) for s in orphan_samples])
    return outcome
def check_studies_fetched_by_samples(self):
    """
    Cross-check the studies recorded in iRODS against the studies that
    Sequencescape reports for this file's samples.

    :return: a list containing one CheckResult (not executed when the
             metadata holds no samples).
    """
    check_results = []
    same_study_for_samples_check = CheckResult(
        check_name=CHECK_NAMES.
        check_studies_in_irods_with_studies_in_seqscape_fetched_by_samples)
    #check_for_samples_in_more_studies = CheckResult(check_name=CHECK_NAMES.check_for_samples_in_more_studies, severity=SEVERITY.WARNING)
    if not self.get_entities_by_type('sample'):
        # No samples in the metadata -> nothing to cross-check against.
        same_study_for_samples_check.executed = False
        same_study_for_samples_check.result = None
        # check_for_samples_in_more_studies.executed = False
        # check_for_samples_in_more_studies.result = None
        # check_results.append(check_for_samples_in_more_studies)
        check_results.append(same_study_for_samples_check)
        return check_results
    # Studies Sequencescape associates with the file's samples vs the
    # studies recorded directly in iRODS metadata.
    studies_by_samples_set = set(
        self.get_all_entities_by_association_by_type('sample', 'study'))
    studies_set = set(self.get_entities_by_type('study'))
    studies_set_names = [study.name for study in studies_set]
    studies_by_samples_set_names = [
        study.name for study in studies_by_samples_set
    ]
    sample_set_ids = [(sample.name, sample.accession_number)
                      for sample in self.get_entities_by_type('sample')]
    if not studies_set.issubset(studies_by_samples_set):
        # Some iRODS study is not among the studies Seqscape reports
        # for these samples.
        error_msg = "For the %s given seqscape samples, the studies in iRODS: %s and the studies in Seqscape DISAGREE: %s" % (
            str(len(sample_set_ids)), studies_set_names,
            studies_by_samples_set_names)
        same_study_for_samples_check.result = RESULT.FAILURE
        same_study_for_samples_check.error_message = error_msg
    else:
        # NOTE(review): in this branch studies_set IS a subset of
        # studies_by_samples_set, so the difference below is always
        # empty and this failure can never trigger — the operands may
        # have been meant the other way around; confirm intent.
        diff_wrong_studies_for_samples_in_irods = studies_set.difference(
            studies_by_samples_set)
        if diff_wrong_studies_for_samples_in_irods:
            error_msg = "Studies in Seqscape and in iRODS for %s samples don't agree. Studies in iRODS and not in Seqscape: %s" % (
                str(len(sample_set_ids)),
                diff_wrong_studies_for_samples_in_irods)
            same_study_for_samples_check.result = RESULT.FAILURE
            same_study_for_samples_check.error_message = error_msg
    check_results.append(same_study_for_samples_check)
    # diff_sam_belongs2more_studies = studies_by_samples_set.difference(studies_set)
    # if diff_sam_belongs2more_studies:
    #     error_msg = "Some samples belong to more than one study. For samples: %s we had these studies as metadata: %s and we found in Seqscape these studies: %s" % (
    #         sample_set_ids,
    #         studies_set_names,
    #         studies_by_samples_set_names)
    #     check_for_samples_in_more_studies.result = RESULT.FAILURE
    #     check_for_samples_in_more_studies.error_message = error_msg
    #     check_results.append(check_for_samples_in_more_studies)
    return check_results
def check_non_public_acls(cls, acls) -> CheckResult:
    """
    Checks that the iRODS object doesn't have associated an ACL giving
    public access to users to it.

    :param acls: list of ACL objects to inspect
    :return: a CheckResult with WARNING severity; not executed when
             there are no ACLs, FAILURE when any ACL grants public
             access. (Annotation fixed from List[CheckResult] — a
             single CheckResult is, and was, returned.)
    """
    check_result = CheckResult(
        check_name=CHECK_NAMES.check_no_public_acl,
        severity=SEVERITY.WARNING)
    if not acls:
        check_result.result = None
        check_result.executed = False
        check_result.error_message = "There are no ACLs."
        return check_result
    for acl in acls:
        if acl.provides_public_access():
            # Fixed: dropped the stray chained assignment to an unused
            # local ("= error_message =") left over from a refactor.
            check_result.error_message = "The following ACL was found: " + str(
                acl)
            check_result.result = RESULT.FAILURE
            # One public ACL is enough to fail the check.
            break
    return check_result
def check_all_replicas_have_same_checksum(cls, replicas) -> CheckResult:
    """
    Verify that every replica reports the same checksum as the first
    replica in the list.

    :param replicas: sequence of replica objects (each with a
                     ``checksum`` attribute)
    :return: a CheckResult; not executed when there are no replicas,
             FAILURE when any replica's checksum differs from the
             first's.
    """
    outcome = CheckResult(
        check_name=CHECK_NAMES.check_all_replicas_same_checksum,
        severity=SEVERITY.IMPORTANT)
    if not replicas:
        outcome.executed = False
        outcome.error_message = ["No replicas to compare with."]
        outcome.result = None
        return outcome
    reference = replicas[0]
    mismatches = []
    for candidate in replicas:
        if candidate.checksum != reference.checksum:
            outcome.result = RESULT.FAILURE
            mismatches.append("Replica: " + str(candidate) +
                              " has different checksum than replica: " +
                              str(reference))
    if mismatches:
        # Joined without separators, matching the original concatenation.
        outcome.error_message = ''.join(mismatches)
    return outcome
def check_samples_fetched_by_studies(self): check_result = CheckResult( check_name=CHECK_NAMES. check_samples_in_irods_same_as_samples_fetched_by_study_from_seqscape ) #"Check if the sample ids in iRODS for a study belong to the same study in Sqeuencescape ") if not self.get_entities_by_type('study'): check_result.executed = False check_result.result = None return check_result samples_by_studies_set = set( self.get_all_entities_by_association_by_type('study', 'sample')) samples_set = set(self.get_entities_by_type('sample')) if not samples_set.issubset(samples_by_studies_set): diff_samples_wrong_study = samples_set.difference( samples_by_studies_set) error_msg = "Some samples don't appear under study(s): %s in Sequencescape, " \ "but they appear under this study in iRODS. Number of samples: %s, " \ "and ids: %s" % ([study.name for study in self.get_entities_by_type('study')], str(len(diff_samples_wrong_study)), [(s.name, s.accession_number) for s in diff_samples_wrong_study]) check_result.error_message = error_msg check_result.result = RESULT.FAILURE return check_result
def check_metadata_across_different_sources(irods_metadata_dict,
                                            header_metadata_dict,
                                            seqsc_metadata_dict,
                                            issues_dict):
    """
    This function checks the metadata from 3 different sources in terms
    of samples, libraries and studies. At the moment the checks across
    these sources consist of comparing: libraries, studies and samples.
    As a result it updates the issues_dict by appending the CheckResults
    obtained after running the latest tests.

    Four directed comparisons are recorded per file:
    seqscape-vs-header, header-vs-seqscape, irods-vs-header and
    header-vs-irods. A comparison that cannot run (missing metadata on
    either side) is marked executed=False with result=None.

    :param irods_metadata_dict: key: fpath, value: irods_metadata for that file
    :param header_metadata_dict: key: fpath, value: header_metadata for that file
    :param seqsc_metadata_dict: key: fpath, value: seqscape_metadata for that file
    :param issues_dict: key: fpath, value: list of CheckResults (mutated in place)
    :return: None (results are appended to issues_dict)
    """
    for fpath, irods_metadata in irods_metadata_dict.items():
        # NOTE(review): .get() returns None when fpath is missing from
        # these dicts, which would make .has_metadata() below raise —
        # presumably all three dicts share the same keys; confirm.
        header_metadata = header_metadata_dict.get(fpath)
        seqscape_metadata = seqsc_metadata_dict.get(fpath)
        ss_vs_h_check_result = CheckResult(
            check_name=CHECK_NAMES.
            check_seqscape_ids_compared_to_header_ids,
            error_message=[])
        h_vs_ss_check_result = CheckResult(
            check_name=CHECK_NAMES.
            check_header_ids_compared_to_seqscape_ids,
            error_message=[])
        i_vs_h_check_result = CheckResult(
            check_name=CHECK_NAMES.check_irods_ids_compared_to_header_ids,
            error_message=[])
        h_vs_i_check_result = CheckResult(
            check_name=CHECK_NAMES.check_header_ids_compared_to_irods_ids,
            error_message=[])
        if not header_metadata.has_metadata():
            # The header is the pivot of all four comparisons, so
            # without it none of them can run.
            error_msg = "No header metadata"
            ss_vs_h_check_result.executed = False
            h_vs_ss_check_result.executed = False
            i_vs_h_check_result.executed = False
            h_vs_i_check_result.executed = False
            i_vs_h_check_result.result = None
            h_vs_i_check_result.result = None
            h_vs_ss_check_result.result = None
            ss_vs_h_check_result.result = None
            ss_vs_h_check_result.error_message.append(error_msg)
            h_vs_ss_check_result.error_message.append(error_msg)
            i_vs_h_check_result.error_message.append(error_msg)
            h_vs_i_check_result.error_message.append(error_msg)
        else:
            if not seqscape_metadata.has_metadata():
                # Only the seqscape<->header pair is blocked.
                error_msg = "No seqscape metadata"
                ss_vs_h_check_result.executed = False
                h_vs_ss_check_result.executed = False
                ss_vs_h_check_result.result = None
                h_vs_ss_check_result.result = None
                ss_vs_h_check_result.error_message.append(error_msg)
                h_vs_ss_check_result.error_message.append(error_msg)
            else:
                # Directed differences: each side compared against the other.
                seqscape_diff_header = seqscape_metadata.difference(
                    header_metadata)
                header_diff_seqscape = header_metadata.difference(
                    seqscape_metadata)
                if seqscape_diff_header:
                    error_msg = "Differences: %s" % seqscape_diff_header
                    ss_vs_h_check_result.error_message = error_msg
                    ss_vs_h_check_result.result = RESULT.FAILURE
                if header_diff_seqscape:
                    error_msg = "Differences: %s" % header_diff_seqscape
                    h_vs_ss_check_result.result = RESULT.FAILURE
                    h_vs_ss_check_result.error_message = error_msg
            if not irods_metadata.has_metadata():
                # Only the irods<->header pair is blocked.
                error_msg = "No irods metadata"
                i_vs_h_check_result.executed = False
                h_vs_i_check_result.executed = False
                i_vs_h_check_result.result = None
                h_vs_i_check_result.result = None
                i_vs_h_check_result.error_message.append(error_msg)
                h_vs_i_check_result.error_message.append(error_msg)
            else:
                irods_diff_header = irods_metadata.difference(
                    header_metadata)
                header_diff_irods = header_metadata.difference(
                    irods_metadata)
                if irods_diff_header:
                    error_msg = "Differences: %s" % irods_diff_header
                    i_vs_h_check_result.error_message = error_msg
                    i_vs_h_check_result.result = RESULT.FAILURE
                if header_diff_irods:
                    error_msg = "Differences between what is in the header and not in iRODS: %s" % header_diff_irods
                    h_vs_i_check_result.error_message = error_msg
                    h_vs_i_check_result.result = RESULT.FAILURE
        # All four results are recorded regardless of which ones ran.
        issues_dict[fpath].append(ss_vs_h_check_result)
        issues_dict[fpath].append(h_vs_ss_check_result)
        issues_dict[fpath].append(i_vs_h_check_result)
        issues_dict[fpath].append(h_vs_i_check_result)