def assert_bundle():
    """
    Build the bundle and verify the project's single publication, the set of
    contributor roles and the supplementary links.
    """
    project_id = UUID('519b58ef-6462-4ed3-8c0d-375b54f53c31')
    project = Bundle(uuid, version, manifest, metadata_files).projects[project_id]

    self.assertEqual(len(project.publications), 1)
    # pop() takes the publication out of the project's collection
    publication = project.publications.pop()

    expected_title = ('Precursors of human CD4+ cytotoxic T lymphocytes '
                      'identified by single-cell transcriptome analysis.')
    self.assertEqual(publication.title, expected_title)
    self.assertEqual(publication.doi, '10.1126/sciimmunol.aan8664')
    self.assertEqual(publication.official_hca, None)
    # noinspection PyDeprecation
    self.assertEqual(publication.title, publication.publication_title)

    expected_url = 'http://immunology.sciencemag.org/content/3/19/eaan8664.long'
    self.assertEqual(publication.url, expected_url)
    # noinspection PyDeprecation
    self.assertEqual(publication.url, publication.publication_url)

    actual_roles = {
        contributor.project_role for contributor in project.contributors
    }
    expected_roles = {None, 'external curator', 'Human Cell Atlas wrangler'}
    self.assertEqual(actual_roles, expected_roles)

    expected_links = {
        'https://www.ebi.ac.uk/gxa/sc/experiments/E-GEOD-106540/Results'
    }
    self.assertEqual(project.supplementary_links, expected_links)
def test_missing_mandatory_checksums(self):
    """
    Constructing a Bundle from a manifest entry whose checksums are absent or
    None must raise a TypeError; the second exception argument is collected
    per case and compared against the expected property names.
    """
    uuid = '404f9663-21c6-49ff-afd0-8cfeff816949'
    entry_without_checksums = {
        'uuid': uuid,
        'version': '',
        'name': '',
        'size': 0,
        'indexed': True,
        'content-type': '',
    }
    checksum_cases = [
        {},
        {'crc32c': None},
        {'crc32c': 'a'},
        {'crc32c': 'a', 'sha1': None},
    ]
    reported = []
    for checksum_fields in checksum_cases:
        entry = {**entry_without_checksums, **checksum_fields}
        with self.assertRaises(TypeError) as cm:
            Bundle(uuid=uuid, version='', manifest=[entry], metadata_files={})
        raised = cm.exception
        self.assertEqual(raised.args[0], 'Property cannot be absent or None')
        reported.append(raised.args[1])
    self.assertEqual(['crc32c', 'crc32c', 'sha256', 'sha256'], reported)
def assert_bundle():
    """
    Build the bundle and verify the number of files, the format expected for
    each file type and that the deprecated `file_format` matches `format`.
    """
    bundle = Bundle(uuid, version, manifest, metadata_files)
    self.assertEqual(len(bundle.files), 6)

    # (file type, expected format) pairs, checked in this order for every file
    expected_formats = (
        (SequenceFile, 'fastq.gz'),
        (SupplementaryFile, 'pdf'),
    )
    for bundle_file in bundle.files.values():
        for file_type, expected_format in expected_formats:
            if isinstance(bundle_file, file_type):
                self.assertEqual(bundle_file.format, expected_format)
        # noinspection PyDeprecation
        self.assertEqual(bundle_file.format, bundle_file.file_format)
def test_sequencing_process_paired_end(self):
    """
    The bundle obtained from `_canned_bundle` must contain exactly one
    sequencing protocol, and its `paired_end` must equal True.
    """
    uuid = '6b498499-c5b4-452f-9ff9-2318dbb86000'
    version = '2019-01-03T163633.780215Z'
    manifest, metadata_files = self._canned_bundle('prod', uuid, version)
    bundle = Bundle(uuid, version, manifest, metadata_files)

    found = []
    for protocol in bundle.protocols.values():
        if isinstance(protocol, SequencingProtocol):
            found.append(protocol)

    self.assertEqual(len(found), 1)
    self.assertEqual(found[0].paired_end, True)
def assert_bundle():
    """
    Build the bundle and verify that the destination types occurring in its
    links are exactly the expected set.
    """
    bundle = Bundle(uuid, version, manifest, metadata_files)

    seen_types = set()
    for link in bundle.links:
        seen_types.add(link.destination_type)

    self.assertEqual(seen_types, {
        'library_preparation_protocol',
        'sequencing_protocol',
        'dissociation_protocol',
        'differentiation_protocol',
        'ipsc_induction_protocol',
        'biomaterial',
        'process',
        'file',
    })
def test_ss2_bundle_vx(
    test_ss2_bundle_uuid_vx,
    test_ss2_bundle_version_vx,
    test_ss2_bundle_manifest_vx,
    ss2_metadata_files_vx,
):
    """Assemble a Bundle from the given uuid, version, manifest and metadata files."""
    bundle = Bundle(
        uuid=test_ss2_bundle_uuid_vx,
        version=test_ss2_bundle_version_vx,
        manifest=test_ss2_bundle_manifest_vx,
        metadata_files=ss2_metadata_files_vx,
    )
    return bundle
def test_tenx_bundle_vx_with_no_expected_cell_count(
    test_tenx_bundle_uuid_vx,
    test_tenx_bundle_version_vx,
    test_tenx_bundle_manifest_vx,
    tenx_metadata_files_vx_with_no_expected_cell_count,
):
    """Assemble a Bundle from the given uuid, version, manifest and metadata files."""
    bundle = Bundle(
        uuid=test_tenx_bundle_uuid_vx,
        version=test_tenx_bundle_version_vx,
        manifest=test_tenx_bundle_manifest_vx,
        metadata_files=tenx_metadata_files_vx_with_no_expected_cell_count,
    )
    return bundle
def test_imaging_protocol(self):
    """
    The bundle must contain exactly one imaging protocol with 240 probes,
    all of which share the assay type 'in situ sequencing'.
    """
    uuid = '94f2ba52-30c8-4de0-a78e-f95a3f8deb9c'
    version = '2019-04-03T103426.471000Z'
    manifest, metadata_files = self._canned_bundle('staging', uuid, version)
    bundle = Bundle(uuid, version, manifest, metadata_files)

    imaging_protocols = [
        protocol
        for protocol in bundle.protocols.values()
        if isinstance(protocol, ImagingProtocol)
    ]
    imaging_protocol = one(imaging_protocols)

    self.assertEqual(len(imaging_protocol.probe), 240)
    seen_assay_types = set()
    for probe in imaging_protocol.probe:
        seen_assay_types.add(probe.assay_type)
    self.assertEqual(seen_assay_types, {'in situ sequencing'})
def test_analysis_protocol(self):
    """
    The bundle must contain exactly one analysis protocol with the expected
    document id and protocol id, and no protocol name.
    """
    uuid = 'ffee7f29-5c38-461a-8771-a68e20ec4a2e'
    version = '2019-02-02T065454.662896Z'
    manifest, metadata_files = self._canned_bundle('prod', uuid, version)
    bundle = Bundle(uuid, version, manifest, metadata_files)

    found = []
    for protocol in bundle.protocols.values():
        if isinstance(protocol, AnalysisProtocol):
            found.append(protocol)
    self.assertEqual(len(found), 1)

    analysis_protocol = found[0]
    self.assertEqual(str(analysis_protocol.document_id),
                     'bb17ee61-193e-4ae1-a014-4f1b1c19b8b7')
    self.assertEqual(analysis_protocol.protocol_id, 'smartseq2_v2.2.0')
    self.assertEqual(analysis_protocol.protocol_name, None)
def assert_bundle():
    """
    Build the bundle and verify the number of project contributors and their
    names, via both `name` and the deprecated `contact_name`.
    """
    project_id = UUID('d96c2451-6e22-441f-a3e6-70fd0878bb1b')
    project = Bundle(uuid, version, manifest, metadata_files).projects[project_id]
    self.assertEqual(len(project.contributors), 5)

    names = {
        'Sabina,,Kanton',
        'Barbara,,Treutlein',
        'J,Gray,Camp',
        'Mallory,Ann,Freeberg',
        'Zhisong,,He',
    }
    current_names = {contributor.name for contributor in project.contributors}
    self.assertEqual(current_names, names)
    # noinspection PyDeprecation
    legacy_names = {
        contributor.contact_name for contributor in project.contributors
    }
    self.assertEqual(legacy_names, names)
def test_canned_staging_area(self):
    """
    Load a staging area from a pinned GitHub ref and, for every link in it,
    build a Bundle and check its UUID and the project's estimated cell count.
    """
    ref = '55628953e4b3a24a7d7798569b6082032bd07a6b'
    url = f'https://github.com/HumanCellAtlas/schema-test-data/tree/{ref}/tests'
    staging_area = GitHubStagingAreaFactory.from_url(url).load_staging_area()
    self.assertGreater(len(staging_area.links), 0)

    project_id = UUID('90bf705c-d891-5ce2-aa54-094488b445c6')
    for link_id in staging_area.links:
        # One sub-test per link so that a failure identifies the link
        with self.subTest(link_id=link_id):
            bundle_parts = staging_area.get_bundle(link_id)
            version, manifest, metadata_files = bundle_parts
            bundle = Bundle(link_id, version, manifest, metadata_files)
            self.assertEqual(bundle.uuid, UUID(link_id))
            project = bundle.projects[project_id]
            self.assertEqual(project.estimated_cell_count, 10000)
def test_imaging_protocol(self):
    """
    The bundle must contain exactly one imaging protocol with 240 targets,
    all of which share the assay type 'in situ sequencing'.
    """
    uuid = '94f2ba52-30c8-4de0-a78e-f95a3f8deb9c'
    version = '2019-04-03T103426.471000Z'
    manifest, metadata_files = self._load_bundle(uuid,
                                                 version,
                                                 replica='aws',
                                                 deployment='staging')
    bundle = Bundle(uuid, version, manifest, metadata_files)

    imaging_protocols = [
        protocol
        for protocol in bundle.protocols.values()
        if isinstance(protocol, ImagingProtocol)
    ]
    imaging_protocol = one(imaging_protocols)

    self.assertEqual(len(imaging_protocol.target), 240)
    seen_assay_types = set()
    for target in imaging_protocol.target:
        seen_assay_types.add(target.assay_type)
    self.assertEqual(seen_assay_types, {'in situ sequencing'})
def test_cell_line(self):
    """
    The bundle must contain exactly one cell line with the expected document
    id, biomaterial id, type (also via the deprecated `cell_line_type`) and
    model organ.
    """
    uuid = 'ffee3a9b-14de-4dda-980f-c08092b2dabe'
    version = '2019-04-17T175706.867000Z'
    manifest, metadata_files = self._canned_bundle('prod', uuid, version)
    bundle = Bundle(uuid, version, manifest, metadata_files)

    found = []
    for biomaterial in bundle.biomaterials.values():
        if isinstance(biomaterial, CellLine):
            found.append(biomaterial)
    self.assertEqual(len(found), 1)

    cell_line = found[0]
    self.assertEqual(str(cell_line.document_id),
                     '961092cd-dcff-4b59-a0d2-ceeef0aece74')
    self.assertEqual(cell_line.biomaterial_id, 'cell_line_at_day_54')
    self.assertEqual(cell_line.has_input_biomaterial, None)
    self.assertEqual(cell_line.type, 'stem cell-derived')
    # noinspection PyDeprecation
    self.assertEqual(cell_line.type, cell_line.cell_line_type)
    self.assertEqual(cell_line.model_organ, 'brain')
def test_name_substitution(self):
    """
    Before a Bundle is constructed, the manifest's file names contain '!' but
    no '/'. Afterwards, both the bundle's manifest keys and the file names in
    the analysis/sequence file entities must contain '/' in place of '!'.
    """
    uuid = 'ffee7f29-5c38-461a-8771-a68e20ec4a2e'
    version = '2019-02-02T065454.662896Z'
    manifest, metadata_files = self._canned_bundle('prod', uuid, version)

    # Capture the names before the Bundle is constructed
    names_before = [entry['name'] for entry in manifest]
    bang_names_before = {name for name in names_before if '!' in name}
    self.assertEqual(
        {
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!.zattrs',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!.zgroup',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_id!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_id!0',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_numeric!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_numeric!0.0',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_numeric_name!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_numeric_name!0',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_string!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_string!0.0',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_string_name!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_string_name!0',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!expression!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!expression!0.0',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!gene_id!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!gene_id!0',
        },
        bang_names_before)
    slash_names_before = {name for name in names_before if '/' in name}
    self.assertEqual(set(), slash_names_before)

    bundle = Bundle(uuid, version, manifest, metadata_files)

    slash_names_expected = {
        name.replace('!', '/') for name in bang_names_before
    }
    entity_file_names = {
        entity.json['file_core']['file_name']
        for entity in bundle.entities.values()
        if isinstance(entity, (AnalysisFile, SequenceFile))
    }
    manifest_names = set(bundle.manifest.keys())
    # The same expectations hold for both views of the file names
    for names_after in (manifest_names, entity_file_names):
        bang_names_after = {name for name in names_after if '!' in name}
        self.assertEqual(set(), bang_names_after)
        slash_names_after = {name for name in names_after if '/' in name}
        self.assertEqual(slash_names_expected, slash_names_after)
def test_links_json_v2_0_0(self):
    """
    Test a bundle with a v2.0.0 links.json and supplementary_file links
    """
    uuid = 'cc0b5aa4-9f66-48d2-aa4f-ed019d1c9439'
    version = '2019-05-15T222432.561000Z'
    manifest, metadata_files = self._canned_bundle('prod', uuid, version)
    bundle = Bundle(uuid, version, manifest, metadata_files)

    # Expected number of links per link type
    expected_counts = {
        'process_link': 6,
        'supplementary_file_link': 2,
    }
    for link_type, expected_count in expected_counts.items():
        matching = [link for link in bundle.links if link.link_type == link_type]
        self.assertEqual(expected_count, len(matching))

    # Both ends of every link must name a known entity type and refer to an
    # entity of that type in the bundle. The source end is checked first.
    for link in bundle.links:
        for end in ('source', 'destination'):
            entity_type = getattr(link, end + '_type')
            self.assertIn(entity_type, api_entity_types)
            entity_id = getattr(link, end + '_id')
            self.assertIn(entity_id, bundle.entities)
            self.assertIsInstance(bundle.entities[entity_id],
                                  api_entity_types[entity_type])
def get_bundle_metadata(uuid, version, dss_url, directurls=False):
    """Factory function to create a `humancellatlas.data.metadata.Bundle` object from bundle information and manifest.

    Args:
        uuid (str): The bundle uuid.
        version (str): The bundle version.
        dss_url (str): Url of Data Storage System to query. The second
            dot-separated component of the URL is taken as the deployment
            name; anything other than 'dev', 'integration' or 'staging'
            (including a URL without any dot) selects the default client.
        directurls (bool): Passed through to `download_bundle_metadata`.

    Returns:
        humancellatlas.data.metadata.Bundle: A bundle metadata object.
    """
    url_parts = dss_url.split('.')
    # A URL without a dot has no deployment component; treat it like any other
    # unrecognized deployment instead of failing with an IndexError.
    dss_deployment = url_parts[1] if len(url_parts) > 1 else None
    if dss_deployment in ('dev', 'integration', 'staging'):
        client = dss_client(deployment=dss_deployment)
    else:
        # dss_client constructor defaults to the production deployment
        client = dss_client()
    # The download returns the version as well, which is what the Bundle is
    # constructed with.
    version, manifest, metadata_files = download_bundle_metadata(
        client=client,
        replica='gcp',
        uuid=uuid,
        version=version,
        directurls=directurls,
    )
    return Bundle(
        uuid=uuid,
        version=version,
        manifest=manifest,
        metadata_files=metadata_files,
    )
def to_json(fqid):
    """
    Download the metadata of the bundle whose id has the form
    `<uuid>.<version>` and return the result of `as_json` for it.
    """
    # partition() yields (uuid, separator, version); drop the separator
    uuid, version = fqid.partition('.')[::2]
    downloaded = download_bundle_metadata(client,
                                          'aws',
                                          uuid,
                                          version,
                                          num_workers=0)
    version, manifest, metadata_files = downloaded
    return as_json(Bundle(uuid, version, manifest, metadata_files))
def _assert_bundle(self, uuid, version, manifest, metadata_files,
                   age_range=None,
                   diseases=frozenset({None}),
                   project_roles=frozenset({None}),
                   storage_methods=frozenset({None}),
                   preservation_methods=frozenset({None}),
                   library_construction_methods=frozenset(),
                   selected_cell_types=frozenset(),
                   accessions=frozenset(),
                   insdc_project_accessions=frozenset(),
                   geo_series_accessions=frozenset(),
                   array_express_accessions=frozenset(),
                   insdc_study_accessions=frozenset(),
                   is_sequencing_bundle=True,
                   slice_thickness=None,
                   ncbi_taxon_ids=None,
                   content_description=None) -> Bundle:
    """
    Construct a Bundle from the given manifest and metadata files, run a
    series of assertions against it and return it.

    The keyword arguments are the expected values the bundle is compared to:

    age_range: expected `organism_age_in_seconds` of the first root entity.

    diseases: expected union of `diseases` over all donor organisms and
    specimens.

    project_roles: expected set of `project_role` over the project's
    contributors.

    storage_methods, preservation_methods: expected sets of the respective
    attribute over `bundle.specimens`. If both are empty, the bundle is
    expected to have no specimens.

    library_construction_methods: expected set of construction methods over
    all library preparation protocols.

    selected_cell_types: expected `selected_cell_types` of the first cell
    suspension. None skips the cell suspension checks.

    accessions, insdc_project_accessions, geo_series_accessions,
    array_express_accessions, insdc_study_accessions: expected values of the
    project attributes of the same name.

    is_sequencing_bundle: whether to check the sequencing inputs and outputs.

    slice_thickness: if not None, the expected list of `slice_thickness`
    over all imaged specimens.

    ncbi_taxon_ids: if not None, the expected set of taxon IDs over all
    biomaterials.

    content_description: if not None, the expected set of content
    descriptions over all files.
    """
    bundle = Bundle(uuid, version, manifest, metadata_files)

    # Every data file's manifest entry should be referenced by a metadata
    # entity that describes the data file. id() is used to work around the
    # fact that dict instances aren't hashable and to ensure that no
    # redundant copies are made.
    self.assertEqual(
        set(id(f.manifest_entry.json) for f in bundle.files.values()),
        set(id(me) for me in manifest if not me['indexed']))

    biomaterials = bundle.biomaterials.values()

    if ncbi_taxon_ids is not None:
        self.assertSetEqual(
            ncbi_taxon_ids,
            set(chain.from_iterable(b.ncbi_taxon_id for b in biomaterials)))

    actual_diseases = set(chain.from_iterable(
        bm.diseases
        for bm in biomaterials
        if isinstance(bm, (DonorOrganism, SpecimenFromOrganism))))
    # noinspection PyDeprecation
    actual_disease = set(chain.from_iterable(
        bm.disease
        for bm in biomaterials
        if isinstance(bm, (DonorOrganism, SpecimenFromOrganism))))
    self.assertEqual(actual_diseases, diseases)
    self.assertEqual(actual_diseases, actual_disease)

    self.assertEqual(str(bundle.uuid), uuid)
    self.assertEqual(bundle.version, version)
    self.assertEqual(1, len(bundle.projects))

    if selected_cell_types is not None:
        cell_suspension = next(x for x in bundle.biomaterials.values()
                               if isinstance(x, CellSuspension))
        self.assertEqual(CellSuspension, type(cell_suspension))
        self.assertEqual(selected_cell_types,
                         cell_suspension.selected_cell_types)
        # noinspection PyDeprecation
        self.assertEqual(cell_suspension.selected_cell_types,
                         cell_suspension.selected_cell_type)
        # noinspection PyDeprecation
        self.assertEqual(cell_suspension.estimated_cell_count,
                         cell_suspension.total_estimated_cells)

    # Exactly one project exists, as asserted above
    project = next(iter(bundle.projects.values()))
    self.assertEqual(Project, type(project))
    self.assertEqual(project_roles,
                     {c.project_role for c in project.contributors})
    # noinspection PyDeprecation
    self.assertLessEqual(len(project.laboratory_names),
                         len(project.contributors))
    # noinspection PyDeprecation
    self.assertEqual(project.project_short_name, project.project_shortname)

    self.assertEqual(insdc_project_accessions,
                     project.insdc_project_accessions)
    self.assertEqual(geo_series_accessions, project.geo_series_accessions)
    self.assertEqual(array_express_accessions,
                     project.array_express_accessions)
    self.assertEqual(insdc_study_accessions, project.insdc_study_accessions)
    self.assertEqual(accessions, project.accessions)

    root_entities = bundle.root_entities().values()
    root_entity_types = {type(e) for e in root_entities}
    self.assertIn(DonorOrganism, root_entity_types)
    self.assertTrue(
        {DonorOrganism, SupplementaryFile}.issuperset(root_entity_types))
    root_entity = next(iter(root_entities))
    self.assertRegex(root_entity.address, 'donor_organism@.*')
    self.assertIsInstance(root_entity, DonorOrganism)
    self.assertEqual(root_entity.organism_age_in_seconds, age_range)
    self.assertIn(root_entity.sex, ('female', 'male', 'unknown'))
    # noinspection PyDeprecation
    self.assertEqual(root_entity.sex, root_entity.biological_sex)

    if is_sequencing_bundle:
        sequencing_input = bundle.sequencing_input
        self.assertGreater(
            len(sequencing_input), 0,
            "There should be at least one sequencing input")
        self.assertEqual(
            len(set(si.document_id for si in sequencing_input)),
            len(sequencing_input),
            "Sequencing inputs should be distinct entities")
        self.assertEqual(
            len(set(si.biomaterial_id for si in sequencing_input)),
            len(sequencing_input),
            "Sequencing inputs should have distinct biomaterial IDs")
        self.assertTrue(
            all(isinstance(si, Biomaterial) for si in sequencing_input),
            "All sequencing inputs should be instances of Biomaterial")
        sequencing_input_schema_names = set(
            si.schema_name for si in sequencing_input)
        self.assertTrue(
            {'cell_suspension', 'specimen_from_organism'}.issuperset(
                sequencing_input_schema_names),
            "The sequencing inputs in the test bundle are of specific schemas")

        sequencing_output = bundle.sequencing_output
        self.assertGreater(
            len(sequencing_output), 0,
            "There should be at least one sequencing output")
        self.assertEqual(
            len(set(so.document_id for so in sequencing_output)),
            len(sequencing_output),
            "Sequencing outputs should be distinct entities")
        self.assertTrue(
            all(isinstance(so, SequenceFile) for so in sequencing_output),
            "All sequencing outputs should be instances of SequenceFile")
        self.assertTrue(
            all(so.manifest_entry.name.endswith('.fastq.gz')
                for so in sequencing_output),
            "All sequencing outputs in the test bundle are fastq files.")

    has_specimens = storage_methods or preservation_methods
    specimen_types = {type(s) for s in bundle.specimens}
    self.assertEqual({SpecimenFromOrganism} if has_specimens else set(),
                     specimen_types)

    self.assertEqual(storage_methods,
                     {s.storage_method for s in bundle.specimens})
    self.assertEqual(preservation_methods,
                     {s.preservation_method for s in bundle.specimens})

    if has_specimens:
        # noinspection PyDeprecation
        self.assertRaises(AttributeError,
                          lambda: bundle.specimens[0].organ_part)

    # Prove that as_json returns a valid JSON structure (no cycles, correct types, etc.)
    self.assertIsInstance(json.dumps(as_json(bundle), default=str), str)

    library_prep_protos = [
        p for p in bundle.protocols.values()
        if isinstance(p, LibraryPreparationProtocol)
    ]
    library_prep_proto_types = {type(p) for p in library_prep_protos}
    has_library_preps = (library_construction_methods != set()
                         or len(library_prep_protos) > 0)
    self.assertEqual(
        {LibraryPreparationProtocol} if has_library_preps else set(),
        library_prep_proto_types)
    self.assertEqual(
        library_construction_methods,
        {p.library_construction_method for p in library_prep_protos})
    # noinspection PyDeprecation
    self.assertEqual(
        library_construction_methods,
        {p.library_construction_approach for p in library_prep_protos})

    if slice_thickness is not None:
        self.assertEqual(slice_thickness, [
            s.slice_thickness
            for s in bundle.entities.values()
            if isinstance(s, ImagedSpecimen)
        ])

    if content_description is not None:
        self.assertSetEqual(
            content_description,
            set(chain.from_iterable(file.content_description
                                    for file in bundle.files.values())))

    return bundle
def main(argv):
    """
    Download the manifest and metadata of one bundle and write each as a JSON
    file named `<uuid><suffix>` in the output directory. With `--api-json`,
    the return value of `as_json` for the bundle is written as well.

    :param argv: the command line arguments to parse
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--dss-url', '-u',
        default=config.dss_endpoint,
        help='The URL of the DSS REST API endpoint from which to download the bundle to be canned '
             '(default: %(default)s).')
    parser.add_argument(
        '--replica', '-r',
        default='aws',
        help="The replica from which to download the bundle to be canned (default: %(default)s).")
    parser.add_argument(
        '--uuid', '-b',
        required=True,
        help='The UUID of the bundle to can.')
    parser.add_argument(
        '--version', '-v',
        help='The version of the bundle to can (default: the latest version).')
    parser.add_argument(
        '--output-dir', '-O',
        default=os.path.join(config.project_root, 'test', 'indexer', 'data'),
        help='The path to the output directory (default: %(default)s).')
    parser.add_argument(
        '--api-json', '-A',
        default=False,
        action='store_true',
        help="Dump the return value of metadata-api's as_json function (default off).")
    args = parser.parse_args(argv)

    dss_client = azul.dss.direct_access_client(
        dss_endpoint=args.dss_url,
        num_workers=config.num_dss_workers)
    version, manifest, metadata_files = download_bundle_metadata(
        client=dss_client,
        replica=args.replica,
        uuid=args.uuid,
        version=args.version,
        num_workers=config.num_dss_workers)
    logger.info('Downloaded bundle %s version %s from replica %s.',
                args.uuid, version, args.replica)

    outputs = [
        (manifest, ".manifest.json"),
        (metadata_files, '.metadata.json'),
    ]
    # Decide on the flag rather than on the truthiness of the as_json result,
    # so that a requested dump is written even if that result is empty.
    if args.api_json:
        api_json = as_json(Bundle(args.uuid, version, manifest, metadata_files))
        outputs.append((api_json, ".api.json"))

    for obj, suffix in outputs:
        path = os.path.join(args.output_dir, args.uuid + suffix)
        with write_file_atomically(path) as f:
            json.dump(obj, f, indent=4)
        logger.info("Successfully wrote %s", path)