def test_difference_when_diffs_i_vs_h(self):
     """
     Expects difference() of the iRODS metadata against the header metadata
     to return only the iRODS sample name 'S1'.
     """
     irods_samples = {
         'name': {'S1'},
         'accession_number': {'EGA1'},
         'internal_id': {'1'},
     }
     irods_libraries = {'name': {'123'}, 'internal_id': {'123'}}
     irods_studies = {
         'name': {"Crohns disease"},
         'accession_number': {'EGAS4'},
         'internal_id': {'4'},
     }
     irods_metadata = IrodsSeqFileMetadata(
         '/seq/123.bam',
         samples=irods_samples,
         libraries=irods_libraries,
         studies=irods_studies)

     header_samples = {
         'name': {'S100'},
         'accession_number': set(),
         'internal_id': set(),
     }
     header_metadata = SAMFileHeaderMetadata(
         '/seq/123.bam',
         samples=header_samples,
         libraries={'internal_id': {'123'}},
         studies={})

     diff = irods_metadata.difference(header_metadata)
     self.assertDictEqual(diff, {'samples': {'name': {'S1'}}})
 def test_difference_when_no_diffs_i_vs_h(self):
     """
     Expects difference() to return an empty dict when the iRODS and the
     header metadata are built from equal sample data.
     """
     def make_samples():
         # A fresh dict per object, so the two metadata objects share no state.
         return {'name': {'S1'}, 'accession_number': set(), 'internal_id': set()}

     irods_metadata = IrodsSeqFileMetadata(
         '/seq/123.bam', samples=make_samples(), libraries={}, studies={})
     header_metadata = SAMFileHeaderMetadata(
         '/seq/123.bam', samples=make_samples(), libraries={}, studies={})

     diff = irods_metadata.difference(header_metadata)
     self.assertDictEqual(diff, {})
Example #3
0
 def test_validate_fields_2(self):
     """
     Validating an object built from a file path alone is expected to give
     five check results: the checksum comparison check has result None and
     every other check has result FAILURE.
     """
     irods_metadata = IrodsSeqFileMetadata(fpath='/seq/1234/1234_5#6.bam')
     check_results = irods_metadata.validate_fields()
     self.assertEqual(len(check_results), 5)
     for check in check_results:
         is_checksum_comparison = (
             check.check_name == CHECK_NAMES.check_by_comparison_checksum_in_meta_with_checksum_at_upload)
         expected = None if is_checksum_comparison else RESULT.FAILURE
         self.assertEqual(check.result, expected)
Example #4
0
 def test_validate_fields_when_wrong_npg_qc(self):
     """
     With npg_qc='aaAAA', no target and matching checksums, five check results
     are expected: the target and npg_qc field checks fail, all others succeed.
     """
     irods_metadata = IrodsSeqFileMetadata(
         fpath='/seq/1234/1234_5#6.bam',
         npg_qc='aaAAA',
         checksum_at_upload='123abc',
         checksum_in_meta='123abc')
     check_results = irods_metadata.validate_fields()
     self.assertEqual(len(check_results), 5)
     for check in check_results:
         should_fail = check.check_name in [CHECK_NAMES.check_target_field,
                                            CHECK_NAMES.check_npg_qc_field]
         expected = RESULT.FAILURE if should_fail else RESULT.SUCCESS
         self.assertEqual(check.result, expected)
 def test_difference_when_diffs_i_vs_h(self):
     """
     Expects difference() of the iRODS metadata against the header metadata
     to return only the iRODS sample name 'S1'.
     """
     irods_kwargs = {
         'samples': {'name': {'S1'}, 'accession_number': {'EGA1'}, 'internal_id': {'1'}},
         'libraries': {'name': {'123'}, 'internal_id': {'123'}},
         'studies': {'name': {"Crohns disease"}, 'accession_number': {'EGAS4'}, 'internal_id': {'4'}},
     }
     header_kwargs = {
         'samples': {'name': {'S100'}, 'accession_number': set(), 'internal_id': set()},
         'libraries': {'internal_id': {'123'}},
         'studies': {},
     }
     irods_metadata = IrodsSeqFileMetadata('/seq/123.bam', **irods_kwargs)
     header_metadata = SAMFileHeaderMetadata('/seq/123.bam', **header_kwargs)

     diff = irods_metadata.difference(header_metadata)
     self.assertDictEqual(diff, {'samples': {'name': {'S1'}}})
 def fetch_and_preprocess_irods_metadata_by_metadata(
         search_criteria, irods_zone, issues_dict, reference):
     """
     Look up files in iRODS by metadata criteria, convert each hit to an
     IrodsSeqFileMetadata and record the outcome of its metadata checks.

     If the lookup raises, the exception is printed and the process exits with status 1.

     :param search_criteria: a dict formed of key = attr name, val = attr value. The operator is by default =.
     :param irods_zone: the irods zone where to search for the data matching the criteria given
     :param issues_dict: an existing mapping of file path to list of issues; the check results
                         of every file found are appended to the list stored under its path
     :param reference: passed on unchanged to each file's check_metadata call
     :return: a dict of key: fpath, value: the iRODS metadata for that path
     """
     metadata_by_fpath = {}
     try:
         raw_entries = iRODSMetadataProvider.retrieve_raw_files_metadata_by_metadata(
             search_criteria, irods_zone)
     except Exception as err:
         print(err)
         sys.exit(1)
     else:
         for raw_entry in raw_entries:
             seq_metadata = IrodsSeqFileMetadata.from_raw_metadata(raw_entry)
             # Collect the results fully before touching either mapping.
             found_issues = list(seq_metadata.check_metadata(reference))
             metadata_by_fpath[raw_entry.fpath] = seq_metadata
             issues_dict[raw_entry.fpath].extend(found_issues)
     return metadata_by_fpath
Example #7
0
def check_metadata_given_as_json_stream(reference=None):
    """
    Read the iRODS metadata as a stream of json data from stdin and use it for checking the files.

    If no metadata object results from the input, a message is printed and the
    process exits with status 1.

    :param reference: string that contains the name of the genome reference =>
                      one wants to check if the data has this reference as metadata
    :return: dict of key = string file path, value = list[CheckResult]
    """
    results_by_fpath = defaultdict(list)
    raw_json = sys.stdin.read()
    metadata_by_fpath = {}
    for baton_obj in convert_json_to_baton_objs(raw_json):
        file_meta = IrodsSeqFileMetadata.from_baton_wrapper(baton_obj)
        results_by_fpath[file_meta.fpath].extend(
            file_meta.check_metadata(reference))
        metadata_by_fpath[file_meta.fpath] = file_meta

    if not metadata_by_fpath:
        print("No irods metadata found. No checks performed.")
        sys.exit(1)

    # Gather the other two metadata sources; both calls also receive the
    # results mapping.
    header_by_fpath = MetadataSelfChecks.fetch_and_preprocess_header_metadata(
        metadata_by_fpath.keys(), results_by_fpath)
    seqscape_by_fpath = MetadataSelfChecks.fetch_and_preprocess_seqscape_metadata(
        metadata_by_fpath, results_by_fpath)

    FileMetadataComparison.check_metadata_across_different_sources(
        metadata_by_fpath, header_by_fpath, seqscape_by_fpath,
        results_by_fpath)
    return results_by_fpath
 def fetch_and_preprocess_irods_metadata_by_path(irods_fpaths, issues_dict,
                                                 reference):
     """
     This function fetches the irods metadata by file path and preprocesses it.
     It also adds the issues found to the issues_dict given as parameter.

     If fetching the metadata of any path raises, the exception is printed and
     the process exits with status 1.

     :param irods_fpaths: iterable of iRODS file paths to fetch the metadata for
     :param issues_dict: an existing mapping of file path to list of issues; it must already
                         provide a list for every path given (e.g. a defaultdict(list)),
                         since the check results are appended to issues_dict[fpath]
     :param reference: passed on unchanged to each file's check_metadata call
     :return: a dict of key: fpath, value: the IrodsSeqFileMetadata built for that path
     """
     # A plain dict: the values are metadata objects, not lists, so a
     # defaultdict(list) would silently insert an empty list for any
     # path that was never fetched.
     irods_metadata_dict = {}
     for fpath in irods_fpaths:
         try:
             raw_metadata = iRODSMetadataProvider.fetch_raw_file_metadata_by_path(
                 fpath)
         except Exception as e:
             print(e)
             sys.exit(1)
         else:
             file_metadata = IrodsSeqFileMetadata.from_raw_metadata(
                 raw_metadata)
             check_results = list(file_metadata.check_metadata(reference))
             irods_metadata_dict[fpath] = file_metadata
             issues_dict[fpath].extend(check_results)
     return irods_metadata_dict
 def test_difference_when_not_the_right_type(self):
     """Expects difference() to raise TypeError when it is given a plain list."""
     samples = {'name': {'S1'}, 'accession_number': set(), 'internal_id': set()}
     irods_metadata = IrodsSeqFileMetadata(
         '/seq/123.bam', samples=samples, libraries={}, studies={})
     compare = irods_metadata.difference
     with self.assertRaises(TypeError):
         compare([1, 2, 3])
 def test_difference_when_no_diffs_i_vs_h(self):
     """
     Expects difference() to return an empty dict when the iRODS and the
     header metadata are built from equal sample data.
     """
     irods_samples = {'name': {'S1'}, 'accession_number': set(), 'internal_id': set()}
     header_samples = {'name': {'S1'}, 'accession_number': set(), 'internal_id': set()}

     irods_metadata = IrodsSeqFileMetadata(
         '/seq/123.bam', samples=irods_samples, libraries={}, studies={})
     header_metadata = SAMFileHeaderMetadata(
         '/seq/123.bam', samples=header_samples, libraries={}, studies={})

     self.assertDictEqual(irods_metadata.difference(header_metadata), {})
Example #11
0
 def test_from_raw_metadata_only_replicas(self):
     """
     Raw metadata that carries only file replicas is expected to convert to
     empty sample and library id sets and an empty checksum_in_meta set.
     """
     first_replica = baton_models.DataObjectReplica(number=1, checksum="123abc")
     second_replica = baton_models.DataObjectReplica(number=2, checksum="abc")
     raw_metadata = IrodsRawFileMetadata(
         fpath='/seq/123.bam',
         file_replicas=[first_replica, second_replica])

     seq_metadata = IrodsSeqFileMetadata.from_raw_metadata(raw_metadata)

     empty_ids = {'name': set(), 'accession_number': set(), 'internal_id': set()}
     self.assertEqual(seq_metadata.samples, empty_ids)
     self.assertEqual(seq_metadata.libraries, empty_ids)
     self.assertEqual(seq_metadata.checksum_in_meta, set())
Example #12
0
def convert_data_object(data_object: DataObject) -> IrodsSeqFileMetadata:
    """
    Parses the given data object from iRODS into the representation used internally.

    The upload checksum is read from the replica numbered
    `IRODS_ORIGINAL_REPLICA_NUMBER`; it is None when the data object has no
    replica collection or no replica with that number. When the data object
    has no metadata, only the path and that checksum are set on the result.

    :param data_object: data object from iRODS, retrieved via baton wrapper
    :return: internal representation of iRODS metadata
    """
    path = data_object.path

    checksum_at_upload = None
    if data_object.replicas is not None:
        # Assuming that replica number `IRODS_ORIGINAL_REPLICA_NUMBER` is the first replica that is created
        original_replica = data_object.replicas.get_by_number(
            IRODS_ORIGINAL_REPLICA_NUMBER)
        if original_replica is not None:
            checksum_at_upload = original_replica.checksum

    metadata = data_object.metadata
    if metadata is None:
        return IrodsSeqFileMetadata(path,
                                    checksum_at_upload=checksum_at_upload)

    references = metadata.get(IRODS_METADATA_REFERENCE_PROPERTY)
    target_values = metadata.get(IRODS_METADATA_TARGET_PROPERTY,
                                 default={None})
    # Take the first value in iteration order. Unlike `list(...)[0]`, an empty
    # value collection yields None instead of raising IndexError.
    target = next(iter(target_values), None)
    # TODO: Add other conversions

    # Prefer the current library id property and fall back to the legacy one.
    if IRODS_METADATA_LIBRARY_ID_PROPERTY in metadata:
        libraries = metadata[IRODS_METADATA_LIBRARY_ID_PROPERTY]
    elif IRODS_METADATA_LEGACY_LIBRARY_ID_PROPERTY in metadata:
        libraries = metadata[IRODS_METADATA_LEGACY_LIBRARY_ID_PROPERTY]
    else:
        libraries = None

    return IrodsSeqFileMetadata(path,
                                references=references,
                                libraries=libraries,
                                checksum_at_upload=checksum_at_upload,
                                target=target)
Example #13
0
    def test_mdata_from_diff_srcs_when_different_id_types(self):
        """
        Comparing iRODS, header and Seqscape metadata that all hold sample name
        'S1' is expected to produce four check results, all of them SUCCESS.
        """
        irods_samples = {'name': {'S1'}, 'accession_number': {'EGA1'}, 'internal_id': set()}
        irods_metadata = IrodsSeqFileMetadata(
            '/seq/123.bam', samples=irods_samples, libraries={}, studies={})
        header_metadata = SAMFileHeaderMetadata(
            '/seq/123.bam', samples={'name': {'S1'}}, libraries={}, studies={})
        seqscape_metadata = SeqscapeMetadata(
            samples={'name': {'S1'}}, libraries={}, studies={})

        # NOTE(review): the objects above are built with '/seq/123.bam' while the
        # mappings below are keyed by '/seq/213.bam' - looks like a typo; confirm.
        issues_dict = defaultdict(list)
        FileMetadataComparison.check_metadata_across_different_sources(
            {'/seq/213.bam': irods_metadata},
            {'/seq/213.bam': header_metadata},
            {'/seq/213.bam': seqscape_metadata},
            issues_dict)

        check_results = issues_dict['/seq/213.bam']
        self.assertEqual(4, len(check_results))

        distinct_results = set()
        for check in check_results:
            distinct_results.add(check.result)
        self.assertSetEqual(distinct_results, {RESULT.SUCCESS})
Example #14
0
def check_metadata_given_as_json_stream(reference=None):
    """
    Read the iRODS metadata as a stream of json data from stdin and use it for checking the files.

    If no metadata object results from the input, a message is printed and the
    process exits with status 1.

    :param reference: string that contains the name of the genome reference =>
                      one wants to check if the data has this reference as metadata
    :return: dict of key = string file path, value = list[CheckResult]
    """
    results_by_fpath = defaultdict(list)
    raw_json = sys.stdin.read()
    metadata_by_fpath = {}
    for baton_obj in convert_json_to_baton_objs(raw_json):
        file_meta = IrodsSeqFileMetadata.from_baton_wrapper(baton_obj)
        results_by_fpath[file_meta.fpath].extend(
            file_meta.check_metadata(reference))
        metadata_by_fpath[file_meta.fpath] = file_meta

    if not metadata_by_fpath:
        print("No irods metadata found. No checks performed.")
        sys.exit(1)

    # Gather the other two metadata sources; both calls also receive the
    # results mapping.
    header_by_fpath = MetadataSelfChecks.fetch_and_preprocess_header_metadata(
        metadata_by_fpath.keys(), results_by_fpath)
    seqscape_by_fpath = MetadataSelfChecks.fetch_and_preprocess_seqscape_metadata(
        metadata_by_fpath, results_by_fpath)

    FileMetadataComparison.check_metadata_across_different_sources(
        metadata_by_fpath, header_by_fpath, seqscape_by_fpath,
        results_by_fpath)
    return results_by_fpath
Example #15
0
 def test_is_npg_qc_valid_4(self):
     """Expects _is_npg_qc_valid to return a true value for the string "0"."""
     self.assertTrue(IrodsSeqFileMetadata._is_npg_qc_valid("0"))
Example #16
0
 def test_is_npg_qc_valid_6(self):
     """Expects _is_npg_qc_valid to return a false value for the string "mamba"."""
     self.assertFalse(IrodsSeqFileMetadata._is_npg_qc_valid("mamba"))
Example #17
0
 def test_extract_reference_name_from_ref_path3(self):
     """Expects 'Homo_sapiens.GRCh38_15' to be extracted from a path ending in 'Homo_sapiens.GRCh38_15.fa'."""
     extracted = IrodsSeqFileMetadata.extract_reference_name_from_ref_path(
         '/lustre/scratch109/srpipe/references/Homo_sapiens/GRCh38_15/all/bwa0_6/Homo_sapiens.GRCh38_15.fa')
     self.assertEqual(extracted, 'Homo_sapiens.GRCh38_15')
Example #18
0
 def test_extract_reference_name_from_ref_path2(self):
     """Expects 'human_g1k_v37' to be extracted from a path ending in 'human_g1k_v37.fasta'."""
     extracted = IrodsSeqFileMetadata.extract_reference_name_from_ref_path(
         '/lustre/scratch110/srpipe/references/Homo_sapiens/1000Genomes/all/bwa/human_g1k_v37.fasta')
     self.assertEqual(extracted, 'human_g1k_v37')
Example #19
0
 def test_check_reference_when_ok(self):
     """Expects check_reference('hs37d5') to succeed when the only reference path is '/lustre/hs37d5.fa'."""
     irods_metadata = IrodsSeqFileMetadata(
         fpath='/seq/1234/1234_5#6.cram',
         references=['/lustre/hs37d5.fa'])
     check = irods_metadata.check_reference('hs37d5')
     self.assertEqual(check.result, RESULT.SUCCESS)
Example #20
0
 def test_is_target_valid_when_empty(self):
     """Expects _is_target_valid to return a false value for the empty string."""
     self.assertFalse(IrodsSeqFileMetadata._is_target_valid(''))
Example #21
0
 def test_is_target_valid_when_invalid(self):
     """Expects _is_target_valid to return a false value for 'somethingelse'."""
     self.assertFalse(IrodsSeqFileMetadata._is_target_valid('somethingelse'))
Example #22
0
 def test_is_target_valid_when_valid_library(self):
     """Expects _is_target_valid to return a true value for 'library'."""
     self.assertTrue(IrodsSeqFileMetadata._is_target_valid('library'))
Example #23
0
 def test_check_checksum_in_meta_present(self):
     """Expects the presence check to succeed when checksum_in_meta is given."""
     irods_metadata = IrodsSeqFileMetadata(
         fpath='/seq/1234/1234_5#6.bam',
         checksum_in_meta='aaa')
     check = irods_metadata.check_checksum_in_meta_present()
     self.assertEqual(check.result, RESULT.SUCCESS)
Example #24
0
 def test_extract_reference_name_from_ref_path4(self):
     """Expects 'Homo_sapiens.GRCh37.dna.all' to be extracted from a path ending in 'Homo_sapiens.GRCh37.dna.all.fa'."""
     extracted = IrodsSeqFileMetadata.extract_reference_name_from_ref_path(
         '/lustre/scratch110/srpipe/references/Homo_sapiens/GRCh37_53/all/bwa/Homo_sapiens.GRCh37.dna.all.fa')
     self.assertEqual(extracted, 'Homo_sapiens.GRCh37.dna.all')
Example #25
0
 def test_check_checksum_in_meta_present_when_absent(self):
     """Expects the presence check to fail when no checksum_in_meta is given."""
     irods_metadata = IrodsSeqFileMetadata(fpath='/seq/1234/1234_5#6.bam')
     check = irods_metadata.check_checksum_in_meta_present()
     self.assertEqual(check.result, RESULT.FAILURE)
Example #26
0
 def test_check_reference_2(self):
     """
     Expects check_reference('') to give a result of None with executed equal
     to False, even though the object has a reference path.
     """
     irods_metadata = IrodsSeqFileMetadata(
         fpath='/seq/1234/1234_5#6.cram',
         references=['/lustre/hs37d5.fa'])
     check = irods_metadata.check_reference('')
     self.assertEqual(check.result, None)
     self.assertEqual(check.executed, False)
Example #27
0
 def test_is_npg_qc_valid_7(self):
     """Expects _is_npg_qc_valid to return a false value for the boolean True."""
     is_valid = IrodsSeqFileMetadata._is_npg_qc_valid(True)
     self.assertFalse(is_valid)
Example #28
0
 def test_check_reference_3(self):
     """
     Expects check_reference('') on an object built without references to
     give a result of None with executed equal to False.
     """
     check = IrodsSeqFileMetadata(fpath='/seq/1234/1234_5#6.cram').check_reference('')
     self.assertEqual(check.result, None)
     self.assertEqual(check.executed, False)
Example #29
0
 def test_extract_reference_name_from_ref_path1(self):
     """Expects 'hs37d5' to be extracted from a path ending in 'hs37d5.fa'."""
     extracted = IrodsSeqFileMetadata.extract_reference_name_from_ref_path(
         '/lustre/scratch109/srpipe/references/Homo_sapiens/1000Genomes_hs37d5/all/bwa/hs37d5.fa')
     self.assertEqual(extracted, 'hs37d5')