def _assert_bundle(self,
                   uuid,
                   version,
                   manifest,
                   metadata_files,
                   age_range=None,
                   diseases=frozenset({None}),
                   project_roles=frozenset({None}),
                   storage_methods=frozenset({None}),
                   preservation_methods=frozenset({None}),
                   library_construction_methods=frozenset(),
                   selected_cell_types=frozenset(),
                   accessions=frozenset(),
                   insdc_project_accessions=frozenset(),
                   geo_series_accessions=frozenset(),
                   array_express_accessions=frozenset(),
                   insdc_study_accessions=frozenset(),
                   is_sequencing_bundle=True,
                   slice_thickness=None,
                   ncbi_taxon_ids=None,
                   content_description=None) -> Bundle:
    """
    Build a Bundle from the given manifest and metadata files, run a battery
    of assertions against it and return it.

    :param uuid: the bundle UUID as a string; compared to ``str(bundle.uuid)``
    :param version: the expected value of ``bundle.version``
    :param manifest: iterable of manifest entries (dicts with an 'indexed' key)
    :param metadata_files: the metadata files passed through to ``Bundle``
    :param age_range: expected ``organism_age_in_seconds`` of the first root
                      entity
    :param diseases: expected set of diseases collected from all
                     DonorOrganism and SpecimenFromOrganism biomaterials
    :param project_roles: expected set of ``project_role`` values of the
                          project's contributors
    :param storage_methods: expected set of ``storage_method`` values of the
                            bundle's specimens
    :param preservation_methods: expected set of ``preservation_method``
                                 values of the bundle's specimens. If this and
                                 ``storage_methods`` are both empty, the bundle
                                 is expected to have no specimens at all.
    :param library_construction_methods: expected set of library construction
                                         methods of the bundle's
                                         LibraryPreparationProtocol instances
    :param selected_cell_types: expected ``selected_cell_types`` of the first
                                CellSuspension; pass None to skip all cell
                                suspension checks
    :param accessions: expected ``project.accessions``
    :param insdc_project_accessions: expected ``project.insdc_project_accessions``
    :param geo_series_accessions: expected ``project.geo_series_accessions``
    :param array_express_accessions: expected ``project.array_express_accessions``
    :param insdc_study_accessions: expected ``project.insdc_study_accessions``
    :param is_sequencing_bundle: whether to check the bundle's sequencing
                                 inputs and outputs
    :param slice_thickness: if not None, the expected list of
                            ``slice_thickness`` values of all ImagedSpecimen
                            entities
    :param ncbi_taxon_ids: if not None, the expected set of NCBI taxon IDs
                           collected from all biomaterials
    :param content_description: if not None, the expected set of content
                                descriptions collected from all files
    :return: the Bundle instance that was constructed and checked
    """
    bundle = Bundle(uuid, version, manifest, metadata_files)

    # Every data file's manifest entry should be referenced by a metadata
    # entity that describes the data file. id() is used to work around the
    # fact that dict instances aren't hashable and to ensure that no
    # redundant copies are made.
    self.assertEqual({id(f.manifest_entry.json) for f in bundle.files.values()},
                     {id(me) for me in manifest if not me['indexed']})

    biomaterials = bundle.biomaterials.values()

    if ncbi_taxon_ids is not None:
        self.assertSetEqual(
            ncbi_taxon_ids,
            set(chain.from_iterable(b.ncbi_taxon_id for b in biomaterials)))

    # Only donors and specimens are consulted for diseases.
    disease_bearers = [bm for bm in biomaterials
                       if isinstance(bm, (DonorOrganism, SpecimenFromOrganism))]
    actual_diseases = set(chain.from_iterable(bm.diseases for bm in disease_bearers))
    # noinspection PyDeprecation
    actual_disease = set(chain.from_iterable(bm.disease for bm in disease_bearers))
    self.assertEqual(actual_diseases, diseases)
    # The deprecated attribute must agree with its replacement.
    self.assertEqual(actual_diseases, actual_disease)

    self.assertEqual(str(bundle.uuid), uuid)
    self.assertEqual(bundle.version, version)
    self.assertEqual(1, len(bundle.projects))

    if selected_cell_types is not None:
        cell_suspension = next(bm for bm in biomaterials
                               if isinstance(bm, CellSuspension))
        # Deliberately an exact type comparison, not isinstance().
        self.assertEqual(CellSuspension, type(cell_suspension))
        self.assertEqual(selected_cell_types, cell_suspension.selected_cell_types)
        # noinspection PyDeprecation
        self.assertEqual(cell_suspension.selected_cell_types,
                         cell_suspension.selected_cell_type)
        # noinspection PyDeprecation
        self.assertEqual(cell_suspension.estimated_cell_count,
                         cell_suspension.total_estimated_cells)

    # Exactly one project exists, as asserted above.
    project = next(iter(bundle.projects.values()))
    self.assertEqual(Project, type(project))
    self.assertEqual(project_roles, {c.project_role for c in project.contributors})
    # noinspection PyDeprecation
    self.assertLessEqual(len(project.laboratory_names), len(project.contributors))
    # noinspection PyDeprecation
    self.assertEqual(project.project_short_name, project.project_shortname)

    self.assertEqual(insdc_project_accessions, project.insdc_project_accessions)
    self.assertEqual(geo_series_accessions, project.geo_series_accessions)
    self.assertEqual(array_express_accessions, project.array_express_accessions)
    self.assertEqual(insdc_study_accessions, project.insdc_study_accessions)
    self.assertEqual(accessions, project.accessions)

    root_entities = bundle.root_entities().values()
    root_entity_types = {type(e) for e in root_entities}
    self.assertIn(DonorOrganism, root_entity_types)
    # No root entity types other than these two are allowed (subset check).
    self.assertLessEqual(root_entity_types, {DonorOrganism, SupplementaryFile})
    root_entity = next(iter(root_entities))
    self.assertRegex(root_entity.address, 'donor_organism@.*')
    self.assertIsInstance(root_entity, DonorOrganism)
    self.assertEqual(root_entity.organism_age_in_seconds, age_range)
    self.assertIn(root_entity.sex, ('female', 'male', 'unknown'))
    # noinspection PyDeprecation
    self.assertEqual(root_entity.sex, root_entity.biological_sex)

    if is_sequencing_bundle:
        sequencing_input = bundle.sequencing_input
        self.assertGreater(len(sequencing_input), 0,
                           "There should be at least one sequencing input")
        self.assertEqual(len({si.document_id for si in sequencing_input}),
                         len(sequencing_input),
                         "Sequencing inputs should be distinct entities")
        self.assertEqual(len({si.biomaterial_id for si in sequencing_input}),
                         len(sequencing_input),
                         "Sequencing inputs should have distinct biomaterial IDs")
        self.assertTrue(all(isinstance(si, Biomaterial) for si in sequencing_input),
                        "All sequencing inputs should be instances of Biomaterial")
        sequencing_input_schema_names = {si.schema_name for si in sequencing_input}
        self.assertLessEqual(sequencing_input_schema_names,
                             {'cell_suspension', 'specimen_from_organism'},
                             "The sequencing inputs in the test bundle are of specific schemas")

        sequencing_output = bundle.sequencing_output
        self.assertGreater(len(sequencing_output), 0,
                           "There should be at least one sequencing output")
        self.assertEqual(len({so.document_id for so in sequencing_output}),
                         len(sequencing_output),
                         "Sequencing outputs should be distinct entities")
        self.assertTrue(all(isinstance(so, SequenceFile) for so in sequencing_output),
                        "All sequencing outputs should be instances of SequenceFile")
        self.assertTrue(all(so.manifest_entry.name.endswith('.fastq.gz')
                            for so in sequencing_output),
                        "All sequencing outputs in the test bundle are fastq files.")

    # Passing empty sets for both expectations means "no specimens expected".
    has_specimens = storage_methods or preservation_methods
    specimen_types = {type(s) for s in bundle.specimens}
    self.assertEqual({SpecimenFromOrganism} if has_specimens else set(), specimen_types)

    self.assertEqual(storage_methods, {s.storage_method for s in bundle.specimens})
    self.assertEqual(preservation_methods, {s.preservation_method for s in bundle.specimens})

    if has_specimens:
        # Accessing `organ_part` on a specimen is expected to raise.
        # noinspection PyDeprecation
        self.assertRaises(AttributeError, lambda: bundle.specimens[0].organ_part)

    # Prove that as_json returns a valid JSON structure (no cycles, correct types, etc.)
    self.assertIsInstance(json.dumps(as_json(bundle), default=str), str)

    library_prep_protos = [p for p in bundle.protocols.values()
                           if isinstance(p, LibraryPreparationProtocol)]
    library_prep_proto_types = {type(p) for p in library_prep_protos}
    has_library_preps = library_construction_methods != set() or bool(library_prep_protos)
    self.assertEqual({LibraryPreparationProtocol} if has_library_preps else set(),
                     library_prep_proto_types)
    self.assertEqual(library_construction_methods,
                     {p.library_construction_method for p in library_prep_protos})
    # noinspection PyDeprecation
    self.assertEqual(library_construction_methods,
                     {p.library_construction_approach for p in library_prep_protos})

    if slice_thickness is not None:
        self.assertEqual(slice_thickness,
                         [s.slice_thickness
                          for s in bundle.entities.values()
                          if isinstance(s, ImagedSpecimen)])

    if content_description is not None:
        self.assertSetEqual(
            content_description,
            set(chain.from_iterable(file.content_description
                                    for file in bundle.files.values())))

    return bundle
def main(argv):
    """
    Download the metadata of one bundle from DSS and write it to JSON files
    in the output directory.

    Two files are always written, ``<uuid>.manifest.json`` and
    ``<uuid>.metadata.json``. If ``--api-json`` is given, the return value of
    ``as_json`` for the bundle is additionally written to ``<uuid>.api.json``.

    :param argv: the list of command line arguments to parse
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument('--dss-url', '-u',
                        default=config.dss_endpoint,
                        help='The URL of the DSS REST API endpoint from which to download the bundle to be canned '
                             '(default: %(default)s).')
    parser.add_argument('--replica', '-r',
                        default='aws',
                        help="The replica from which to download the bundle to be canned (default: %(default)s).")
    parser.add_argument('--uuid', '-b',
                        required=True,
                        help='The UUID of the bundle to can.')
    parser.add_argument('--version', '-v',
                        help='The version of the bundle to can (default: the latest version).')
    parser.add_argument('--output-dir', '-O',
                        default=os.path.join(config.project_root, 'test', 'indexer', 'data'),
                        help='The path to the output directory (default: %(default)s).')
    parser.add_argument('--api-json', '-A',
                        default=False,
                        action='store_true',
                        help="Dump the return value of metadata-api's as_json function (default off).")
    args = parser.parse_args(argv)

    dss_client = azul.dss.direct_access_client(dss_endpoint=args.dss_url,
                                               num_workers=config.num_dss_workers)
    version, manifest, metadata_files = download_bundle_metadata(client=dss_client,
                                                                 replica=args.replica,
                                                                 uuid=args.uuid,
                                                                 version=args.version,
                                                                 num_workers=config.num_dss_workers)
    logger.info('Downloaded bundle %s version %s from replica %s.',
                args.uuid, version, args.replica)

    outputs = [(manifest, ".manifest.json"), (metadata_files, '.metadata.json')]
    if args.api_json:
        # Gate on the flag rather than on the truthiness of the result so
        # that an explicitly requested dump is written even if as_json()
        # returns an empty structure.
        api_json = as_json(Bundle(args.uuid, version, manifest, metadata_files))
        outputs.append((api_json, ".api.json"))

    for obj, suffix in outputs:
        path = os.path.join(args.output_dir, args.uuid + suffix)
        with write_file_atomically(path) as f:
            json.dump(obj, f, indent=4)
        logger.info("Successfully wrote %s", path)
def to_json(fqid):
    """
    Download the bundle identified by the given fully qualified ID from the
    'aws' replica and return the result of ``as_json`` for it.

    :param fqid: a string of the form ``<uuid>.<version>``. The string is
                 split at the first '.'; if there is none, the empty string
                 is passed on as the requested version.
    :return: the return value of ``as_json`` for the downloaded bundle

    Relies on a ``client`` object from the surrounding scope.
    """
    bundle_uuid, _separator, requested_version = fqid.partition('.')
    resolved_version, manifest, metadata_files = download_bundle_metadata(
        client,
        'aws',
        bundle_uuid,
        requested_version,
        num_workers=0)
    return as_json(Bundle(bundle_uuid, resolved_version, manifest, metadata_files))
def test_one_bundle(self):
    """
    Download the metadata of a few known bundles, construct a Bundle from
    each and check its project, root entities, sequencing inputs and
    outputs, JSON serialisation and specimens.

    Each bundle is checked in its own sub-test so that a failure in one does
    not prevent the others from being checked.
    """
    cases = [
        # A v5 bundle
        (None, 'aws', 'b2216048-7eaa-45f4-8077-5a3fb4204953', None,
         AgeRange(min=3628800, max=7257600)),
        # A vx bundle with a cell_suspension as sequencing input
        ('integration', 'aws', '1e276fdd-d885-4a18-b5b8-df33f1347c1a',
         '2018-08-03T082009.272868Z', None),
        # A vx bundle with a specimen_from_organism as sequencing input
        ('integration', 'aws', '17ef531b-1bb7-425d-bbf7-32721242dde7',
         '2018-08-17T203538.886280Z', None),
    ]
    for deployment, replica, uuid, version, age_range in cases:
        with self.subTest(deployment=deployment, replica=replica, uuid=uuid, age_range=age_range):
            client = dss_client(deployment)
            # The download returns the version it actually fetched, which
            # replaces the requested one (possibly None) from here on.
            version, manifest, metadata_files = download_bundle_metadata(client, replica, uuid, version)
            bundle = Bundle(uuid, version, manifest, metadata_files)
            self.assertEqual(str(bundle.uuid), uuid)
            self.assertEqual(bundle.version, version)
            self.assertEqual(1, len(bundle.projects))
            self.assertEqual({Project}, {type(e) for e in bundle.projects.values()})

            root_entities = bundle.root_entities().values()
            self.assertEqual({DonorOrganism}, {type(e) for e in root_entities})
            root_entity = next(iter(root_entities))
            self.assertRegex(root_entity.address, 'donor_organism@.*')
            self.assertIsInstance(root_entity, DonorOrganism)
            self.assertEqual(root_entity.organism_age_in_seconds, age_range)
            self.assertIn(root_entity.sex, ('female', 'unknown'))

            sequencing_input = bundle.sequencing_input
            self.assertGreater(len(sequencing_input), 0,
                               "There should be at least one sequencing input")
            self.assertEqual(len({si.document_id for si in sequencing_input}),
                             len(sequencing_input),
                             "Sequencing inputs should be distinct entities")
            self.assertEqual(len({si.biomaterial_id for si in sequencing_input}),
                             len(sequencing_input),
                             "Sequencing inputs should have distinct biomaterial IDs")
            self.assertTrue(all(isinstance(si, Biomaterial) for si in sequencing_input),
                            "All sequencing inputs should be instances of Biomaterial")
            sequencing_input_schema_names = {si.schema_name for si in sequencing_input}
            # Subset check: no schema other than these two may occur.
            self.assertLessEqual(sequencing_input_schema_names,
                                 {'cell_suspension', 'specimen_from_organism'},
                                 "The sequencing inputs in the test bundle are of specific schemas")

            sequencing_output = bundle.sequencing_output
            self.assertGreater(len(sequencing_output), 0,
                               "There should be at least one sequencing output")
            self.assertEqual(len({so.document_id for so in sequencing_output}),
                             len(sequencing_output),
                             "Sequencing outputs should be distinct entities")
            self.assertTrue(all(isinstance(so, SequenceFile) for so in sequencing_output),
                            "All sequencing outputs should be instances of SequenceFile")
            self.assertTrue(all(so.manifest_entry.name.endswith('.fastq.gz')
                                for so in sequencing_output),
                            "All sequencing outputs in the test bundle are fastq files.")

            # Serialise the bundle to prove that as_json returns a structure
            # json can handle; assert on the result instead of dumping the
            # whole document to stdout on every run.
            self.assertIsInstance(json.dumps(as_json(bundle), indent=4), str)

            self.assertEqual({SpecimenFromOrganism}, {type(s) for s in bundle.specimens})