Ejemplo n.º 1
0
    def _assert_bundle(self,
                       uuid,
                       version,
                       manifest,
                       metadata_files,
                       age_range=None,
                       diseases=frozenset({None}),
                       project_roles=frozenset({None}),
                       storage_methods=frozenset({None}),
                       preservation_methods=frozenset({None}),
                       library_construction_methods=frozenset(),
                       selected_cell_types=frozenset(),
                       accessions=frozenset(),
                       insdc_project_accessions=frozenset(),
                       geo_series_accessions=frozenset(),
                       array_express_accessions=frozenset(),
                       insdc_study_accessions=frozenset(),
                       is_sequencing_bundle=True,
                       slice_thickness=None,
                       ncbi_taxon_ids=None,
                       content_description=None) -> Bundle:
        """
        Build a ``Bundle`` from the given inputs and run the shared set of
        assertions against it.

        :param uuid: bundle UUID handed to ``Bundle``; must equal
                     ``str(bundle.uuid)``.
        :param version: bundle version handed to ``Bundle``; must equal
                        ``bundle.version``.
        :param manifest: iterable of manifest entries, each subscriptable by
                         ``'indexed'``.
        :param metadata_files: metadata files handed to ``Bundle``.
        :param age_range: expected ``organism_age_in_seconds`` of the first
                          root entity.
        :param diseases: expected union of ``diseases`` over all donor
                         organisms and specimens among the biomaterials.
        :param project_roles: expected set of ``project_role`` values over the
                              project's contributors.
        :param storage_methods: expected set of ``storage_method`` values over
                                ``bundle.specimens``.
        :param preservation_methods: expected set of ``preservation_method``
                                     values over ``bundle.specimens``. If both
                                     this and ``storage_methods`` are falsy,
                                     the bundle is expected to have no
                                     specimens.
        :param library_construction_methods: expected set of construction
                                             methods over the library
                                             preparation protocols.
        :param selected_cell_types: expected ``selected_cell_types`` of the
                                    first cell suspension; ``None`` skips all
                                    cell suspension checks.
        :param accessions: expected ``project.accessions``.
        :param insdc_project_accessions: expected
                                         ``project.insdc_project_accessions``.
        :param geo_series_accessions: expected
                                      ``project.geo_series_accessions``.
        :param array_express_accessions: expected
                                         ``project.array_express_accessions``.
        :param insdc_study_accessions: expected
                                       ``project.insdc_study_accessions``.
        :param is_sequencing_bundle: when true, also verify the bundle's
                                     sequencing inputs and outputs.
        :param slice_thickness: expected list of ``slice_thickness`` values of
                                the imaged specimens, in ``bundle.entities``
                                order; ``None`` skips the check.
        :param ncbi_taxon_ids: expected set of NCBI taxon IDs over all
                               biomaterials; ``None`` skips the check.
        :param content_description: expected set of content descriptions over
                                    all files; ``None`` skips the check.
        :return: the ``Bundle`` instance that was constructed and verified.
        """
        bundle = Bundle(uuid, version, manifest, metadata_files)

        # Every data file's manifest entry should be referenced by a metadata
        # entity that describes the data file. id() is used to work around the
        # fact that dict instances aren't hashable and to ensure that no
        # redundant copies are made.
        self.assertEqual(
            {id(f.manifest_entry.json) for f in bundle.files.values()},
            {id(me) for me in manifest if not me['indexed']})

        biomaterials = bundle.biomaterials.values()

        if ncbi_taxon_ids is not None:
            self.assertSetEqual(
                ncbi_taxon_ids,
                set(chain.from_iterable(b.ncbi_taxon_id
                                        for b in biomaterials)))

        # Only donors and specimens contribute to the disease checks.
        disease_carriers = [
            bm for bm in biomaterials
            if isinstance(bm, (DonorOrganism, SpecimenFromOrganism))
        ]
        actual_diseases = set(
            chain.from_iterable(bm.diseases for bm in disease_carriers))
        # The deprecated singular property must agree with the plural one.
        # noinspection PyDeprecation
        actual_disease = set(
            chain.from_iterable(bm.disease for bm in disease_carriers))
        self.assertEqual(actual_diseases, diseases)
        self.assertEqual(actual_diseases, actual_disease)
        self.assertEqual(str(bundle.uuid), uuid)
        self.assertEqual(bundle.version, version)
        self.assertEqual(1, len(bundle.projects))

        if selected_cell_types is not None:
            cell_suspension = next(x for x in bundle.biomaterials.values()
                                   if isinstance(x, CellSuspension))
            self.assertEqual(CellSuspension, type(cell_suspension))
            self.assertEqual(selected_cell_types,
                             cell_suspension.selected_cell_types)
            # noinspection PyDeprecation
            self.assertEqual(cell_suspension.selected_cell_types,
                             cell_suspension.selected_cell_type)
            # noinspection PyDeprecation
            self.assertEqual(cell_suspension.estimated_cell_count,
                             cell_suspension.total_estimated_cells)

        # Exactly one project exists, as asserted above.
        project = next(iter(bundle.projects.values()))
        self.assertEqual(Project, type(project))
        self.assertEqual(project_roles,
                         {c.project_role for c in project.contributors})
        # noinspection PyDeprecation
        self.assertLessEqual(len(project.laboratory_names),
                             len(project.contributors))
        # noinspection PyDeprecation
        self.assertEqual(project.project_short_name, project.project_shortname)

        self.assertEqual(insdc_project_accessions,
                         project.insdc_project_accessions)
        self.assertEqual(geo_series_accessions, project.geo_series_accessions)
        self.assertEqual(array_express_accessions,
                         project.array_express_accessions)
        self.assertEqual(insdc_study_accessions,
                         project.insdc_study_accessions)
        self.assertEqual(accessions, project.accessions)

        root_entities = bundle.root_entities().values()
        root_entity_types = {type(e) for e in root_entities}
        self.assertIn(DonorOrganism, root_entity_types)
        # For sets, `<=` is the subset test; on failure both sets are shown.
        self.assertLessEqual(root_entity_types,
                             {DonorOrganism, SupplementaryFile})
        root_entity = next(iter(root_entities))
        self.assertRegex(root_entity.address, 'donor_organism@.*')
        self.assertIsInstance(root_entity, DonorOrganism)
        self.assertEqual(root_entity.organism_age_in_seconds, age_range)
        self.assertIn(root_entity.sex, ('female', 'male', 'unknown'))
        # noinspection PyDeprecation
        self.assertEqual(root_entity.sex, root_entity.biological_sex)

        if is_sequencing_bundle:
            sequencing_input = bundle.sequencing_input
            self.assertGreater(
                len(sequencing_input), 0,
                "There should be at least one sequencing input")
            self.assertEqual(
                len({si.document_id for si in sequencing_input}),
                len(sequencing_input),
                "Sequencing inputs should be distinct entities")
            self.assertEqual(
                len({si.biomaterial_id for si in sequencing_input}),
                len(sequencing_input),
                "Sequencing inputs should have distinct biomaterial IDs")
            self.assertTrue(
                all(isinstance(si, Biomaterial) for si in sequencing_input),
                "All sequencing inputs should be instances of Biomaterial")
            sequencing_input_schema_names = {si.schema_name
                                             for si in sequencing_input}
            self.assertLessEqual(
                sequencing_input_schema_names,
                {'cell_suspension', 'specimen_from_organism'},
                "The sequencing inputs in the test bundle are of specific schemas")

            sequencing_output = bundle.sequencing_output
            self.assertGreater(
                len(sequencing_output), 0,
                "There should be at least one sequencing output")
            self.assertEqual(
                len({so.document_id for so in sequencing_output}),
                len(sequencing_output),
                "Sequencing outputs should be distinct entities")
            self.assertTrue(
                all(isinstance(so, SequenceFile) for so in sequencing_output),
                "All sequencing outputs should be instances of SequenceFile")
            self.assertTrue(
                all(
                    so.manifest_entry.name.endswith('.fastq.gz')
                    for so in sequencing_output),
                "All sequencing outputs in the test bundle are fastq files.")

        has_specimens = storage_methods or preservation_methods
        specimen_types = {type(s) for s in bundle.specimens}
        self.assertEqual({SpecimenFromOrganism} if has_specimens else set(),
                         specimen_types)

        self.assertEqual(storage_methods,
                         {s.storage_method for s in bundle.specimens})
        self.assertEqual(preservation_methods,
                         {s.preservation_method for s in bundle.specimens})

        if has_specimens:
            # Accessing the removed `organ_part` attribute must fail.
            # noinspection PyDeprecation
            self.assertRaises(AttributeError,
                              lambda: bundle.specimens[0].organ_part)

        # Prove that as_json returns a valid JSON structure (no cycles,
        # correct types, etc.)
        self.assertIsInstance(json.dumps(as_json(bundle), default=str), str)

        library_prep_protos = [
            p for p in bundle.protocols.values()
            if isinstance(p, LibraryPreparationProtocol)
        ]
        library_prep_proto_types = {type(p) for p in library_prep_protos}
        has_library_preps = (library_construction_methods != set()
                             or len(library_prep_protos) > 0)
        self.assertEqual(
            {LibraryPreparationProtocol} if has_library_preps else set(),
            library_prep_proto_types)
        self.assertEqual(
            library_construction_methods,
            {p.library_construction_method for p in library_prep_protos})
        # noinspection PyDeprecation
        self.assertEqual(
            library_construction_methods,
            {p.library_construction_approach for p in library_prep_protos})

        if slice_thickness is not None:
            self.assertEqual(slice_thickness, [
                s.slice_thickness for s in bundle.entities.values()
                if isinstance(s, ImagedSpecimen)
            ])

        if content_description is not None:
            self.assertSetEqual(
                content_description,
                set(
                    chain.from_iterable(file.content_description
                                        for file in bundle.files.values())))

        return bundle
Ejemplo n.º 2
0
def main(argv):
    """
    Download the metadata of one bundle from the DSS and can it as JSON files.

    The manifest and the metadata files are always written, as
    ``<uuid>.manifest.json`` and ``<uuid>.metadata.json`` in the output
    directory. With ``--api-json``, the result of ``as_json`` for the bundle
    is additionally written to ``<uuid>.api.json``.

    :param argv: the command line arguments to parse, without the program
                 name.
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--dss-url',
        '-u',
        default=config.dss_endpoint,
        help=
        'The URL of the DSS REST API endpoint from which to download the bundle to be canned '
        '(default: %(default)s).')
    parser.add_argument(
        '--replica',
        '-r',
        default='aws',
        help=
        "The replica from which to download the bundle to be canned (default: %(default)s)."
    )
    parser.add_argument('--uuid',
                        '-b',
                        required=True,
                        help='The UUID of the bundle to can.')
    parser.add_argument(
        '--version',
        '-v',
        help='The version of the bundle to can (default: the latest version).'
    )
    parser.add_argument(
        '--output-dir',
        '-O',
        default=os.path.join(config.project_root, 'test', 'indexer', 'data'),
        help='The path to the output directory (default: %(default)s).')
    parser.add_argument(
        '--api-json',
        '-A',
        default=False,
        action='store_true',
        help=
        "Dump the return value of metadata-api's as_json function (default off)."
    )
    args = parser.parse_args(argv)

    dss_client = azul.dss.direct_access_client(
        dss_endpoint=args.dss_url, num_workers=config.num_dss_workers)
    # The returned version replaces the requested one from here on.
    version, manifest, metadata_files = download_bundle_metadata(
        client=dss_client,
        replica=args.replica,
        uuid=args.uuid,
        version=args.version,
        num_workers=config.num_dss_workers)
    logger.info('Downloaded bundle %s version %s from replica %s.', args.uuid,
                version, args.replica)

    outputs = [
        (manifest, ".manifest.json"),
        (metadata_files, '.metadata.json'),
    ]
    # Decide on the flag itself rather than on the truthiness of the result,
    # so that an empty as_json result is still written when it was requested.
    if args.api_json:
        api_json = as_json(Bundle(args.uuid, version, manifest,
                                  metadata_files))
        outputs.append((api_json, ".api.json"))

    for obj, suffix in outputs:
        path = os.path.join(args.output_dir, args.uuid + suffix)
        with write_file_atomically(path) as f:
            json.dump(obj, f, indent=4)
        logger.info("Successfully wrote %s", path)
Ejemplo n.º 3
0
 def to_json(fqid):
     """
     Download the bundle identified by ``fqid`` and return its ``as_json``
     representation.

     ``fqid`` is split at the first ``'.'``: the part before it is used as the
     bundle UUID and the part after it as the requested version. Without a
     ``'.'`` the requested version is the empty string.
     """
     bundle_uuid, _, requested_version = fqid.partition('.')
     # The download returns the version to use for constructing the bundle.
     resolved_version, manifest, metadata_files = download_bundle_metadata(
         client, 'aws', bundle_uuid, requested_version, num_workers=0)
     return as_json(
         Bundle(bundle_uuid, resolved_version, manifest, metadata_files))
Ejemplo n.º 4
0
    def test_one_bundle(self):
        """
        Download each of a fixed set of bundles and verify its project, root
        entity, sequencing inputs and outputs, and specimens.

        Each case runs in its own sub-test and is described by a deployment,
        a replica, a bundle UUID, the requested version (``None`` for the v5
        bundle) and the expected age range of the donor.
        """
        cases = [
            # A v5 bundle
            (None, 'aws', 'b2216048-7eaa-45f4-8077-5a3fb4204953', None,
             AgeRange(min=3628800, max=7257600)),
            # A vx bundle with a cell_suspension as sequencing input
            ('integration', 'aws', '1e276fdd-d885-4a18-b5b8-df33f1347c1a',
             '2018-08-03T082009.272868Z', None),
            # A vx bundle with a specimen_from_organism as sequencing input
            ('integration', 'aws', '17ef531b-1bb7-425d-bbf7-32721242dde7',
             '2018-08-17T203538.886280Z', None),
        ]
        for deployment, replica, uuid, version, age_range in cases:
            with self.subTest(deployment=deployment,
                              replica=replica,
                              uuid=uuid,
                              version=version,
                              age_range=age_range):
                client = dss_client(deployment)
                # Keep the downloaded version separate from the requested one
                # instead of rebinding the loop variable.
                resolved_version, manifest, metadata_files = \
                    download_bundle_metadata(client, replica, uuid, version)
                bundle = Bundle(uuid, resolved_version, manifest,
                                metadata_files)
                self.assertEqual(str(bundle.uuid), uuid)
                self.assertEqual(bundle.version, resolved_version)
                self.assertEqual(1, len(bundle.projects))
                self.assertEqual({Project},
                                 {type(e) for e in bundle.projects.values()})
                root_entities = bundle.root_entities().values()
                self.assertEqual({DonorOrganism},
                                 {type(e) for e in root_entities})
                root_entity = next(iter(root_entities))
                self.assertRegex(root_entity.address, 'donor_organism@.*')
                self.assertIsInstance(root_entity, DonorOrganism)
                self.assertEqual(root_entity.organism_age_in_seconds,
                                 age_range)
                self.assertIn(root_entity.sex, ('female', 'unknown'))

                sequencing_input = bundle.sequencing_input
                self.assertGreater(
                    len(sequencing_input), 0,
                    "There should be at least one sequencing input")
                self.assertEqual(
                    len({si.document_id for si in sequencing_input}),
                    len(sequencing_input),
                    "Sequencing inputs should be distinct entities")
                self.assertEqual(
                    len({si.biomaterial_id for si in sequencing_input}),
                    len(sequencing_input),
                    "Sequencing inputs should have distinct biomaterial IDs")
                self.assertTrue(
                    all(
                        isinstance(si, Biomaterial)
                        for si in sequencing_input),
                    "All sequencing inputs should be instances of Biomaterial")
                sequencing_input_schema_names = {si.schema_name
                                                 for si in sequencing_input}
                # For sets, `<=` is the subset test; on failure both sets are
                # shown.
                self.assertLessEqual(
                    sequencing_input_schema_names,
                    {'cell_suspension', 'specimen_from_organism'},
                    "The sequencing inputs in the test bundle are of specific schemas")

                sequencing_output = bundle.sequencing_output
                self.assertGreater(
                    len(sequencing_output), 0,
                    "There should be at least one sequencing output")
                self.assertEqual(
                    len({so.document_id for so in sequencing_output}),
                    len(sequencing_output),
                    "Sequencing outputs should be distinct entities")
                self.assertTrue(
                    all(
                        isinstance(so, SequenceFile)
                        for so in sequencing_output),
                    "All sequencing outputs should be instances of SequenceFile"
                )
                self.assertTrue(
                    all(
                        so.manifest_entry.name.endswith('.fastq.gz')
                        for so in sequencing_output),
                    "All sequencing outputs in the test bundle are fastq files."
                )

                # Serializing also exercises as_json on a real bundle.
                print(json.dumps(as_json(bundle), indent=4))

                self.assertEqual({SpecimenFromOrganism},
                                 {type(s) for s in bundle.specimens})