コード例 #1
0
 def assert_bundle():
     """
     Build a bundle from the inputs found in the surrounding scope and
     verify the publication, contributor roles and supplementary links of
     one specific project.
     """
     bundle = Bundle(uuid, version, manifest, metadata_files)
     project_id = UUID('519b58ef-6462-4ed3-8c0d-375b54f53c31')
     project = bundle.projects[project_id]

     # Exactly one publication is expected; pop() removes it from the
     # project's collection.
     self.assertEqual(len(project.publications), 1)
     publication = project.publications.pop()

     expected_title = 'Precursors of human CD4+ cytotoxic T lymphocytes identified by single-cell transcriptome analysis.'
     self.assertEqual(publication.title, expected_title)
     # noinspection PyDeprecation
     self.assertEqual(publication.doi, '10.1126/sciimmunol.aan8664')
     self.assertEqual(publication.official_hca, None)
     self.assertEqual(publication.title, publication.publication_title)

     expected_url = 'http://immunology.sciencemag.org/content/3/19/eaan8664.long'
     self.assertEqual(publication.url, expected_url)
     # noinspection PyDeprecation
     self.assertEqual(publication.url, publication.publication_url)

     # The distinct roles held by the project's contributors.
     expected_roles = {None, 'external curator', 'Human Cell Atlas wrangler'}
     actual_roles = set(
         contributor.project_role for contributor in project.contributors)
     self.assertEqual(actual_roles, expected_roles)

     expected_links = {
         'https://www.ebi.ac.uk/gxa/sc/experiments/E-GEOD-106540/Results'
     }
     self.assertEqual(project.supplementary_links, expected_links)
コード例 #2
0
 def test_missing_mandatory_checksums(self):
     """
     Constructing a bundle from a manifest entry with missing or None
     checksums must raise a TypeError whose second argument names the
     offending property.
     """
     uuid = '404f9663-21c6-49ff-afd0-8cfeff816949'
     # Checksum properties supplied by each case, in the order tested.
     cases = [
         {},
         {'crc32c': None},
         {'crc32c': 'a'},
         {'crc32c': 'a', 'sha1': None},
     ]
     reported = []
     for case in cases:
         # A minimal manifest entry, extended with the case's checksums.
         entry = {
             'uuid': uuid,
             'version': '',
             'name': '',
             'size': 0,
             'indexed': True,
             'content-type': '',
             **case
         }
         with self.assertRaises(TypeError) as cm:
             Bundle(uuid=uuid,
                    version='',
                    manifest=[entry],
                    metadata_files={})
         error_args = cm.exception.args
         self.assertEqual(error_args[0],
                          'Property cannot be absent or None')
         reported.append(error_args[1])
     self.assertEqual(['crc32c', 'crc32c', 'sha256', 'sha256'], reported)
コード例 #3
0
 def assert_bundle():
     """
     Build a bundle from the inputs found in the surrounding scope and
     verify the number of files and the format reported by each of them.
     """
     bundle = Bundle(uuid, version, manifest, metadata_files)
     self.assertEqual(len(bundle.files), 6)
     # Expected format per file class; both classes are checked for every
     # file, in this order.
     expected_formats = (
         (SequenceFile, 'fastq.gz'),
         (SupplementaryFile, 'pdf'),
     )
     for data_file in bundle.files.values():
         for file_class, expected_format in expected_formats:
             if isinstance(data_file, file_class):
                 self.assertEqual(data_file.format, expected_format)
         # noinspection PyDeprecation
         self.assertEqual(data_file.format, data_file.file_format)
コード例 #4
0
 def test_sequencing_process_paired_end(self):
     """
     The canned bundle must contain exactly one sequencing protocol and
     that protocol must be flagged as paired-end.
     """
     uuid = '6b498499-c5b4-452f-9ff9-2318dbb86000'
     version = '2019-01-03T163633.780215Z'
     manifest, metadata_files = self._canned_bundle('prod', uuid, version)
     bundle = Bundle(uuid, version, manifest, metadata_files)
     sequencing_protocols = []
     for protocol in bundle.protocols.values():
         if isinstance(protocol, SequencingProtocol):
             sequencing_protocols.append(protocol)
     self.assertEqual(len(sequencing_protocols), 1)
     self.assertEqual(sequencing_protocols[0].paired_end, True)
コード例 #5
0
 def assert_bundle():
     """
     Build a bundle from the inputs found in the surrounding scope and
     verify the set of destination types occurring in its links.
     """
     bundle = Bundle(uuid, version, manifest, metadata_files)
     expected_types = {
         'library_preparation_protocol',
         'sequencing_protocol',
         'dissociation_protocol',
         'differentiation_protocol',
         'ipsc_induction_protocol',
         'biomaterial',
         'process',
         'file',
     }
     destination_types = set(link.destination_type for link in bundle.links)
     self.assertEqual(destination_types, expected_types)
コード例 #6
0
def test_ss2_bundle_vx(
    test_ss2_bundle_uuid_vx,
    test_ss2_bundle_version_vx,
    test_ss2_bundle_manifest_vx,
    ss2_metadata_files_vx,
):
    """Return a `Bundle` assembled from the four given bundle components."""
    bundle_kwargs = {
        'uuid': test_ss2_bundle_uuid_vx,
        'version': test_ss2_bundle_version_vx,
        'manifest': test_ss2_bundle_manifest_vx,
        'metadata_files': ss2_metadata_files_vx,
    }
    return Bundle(**bundle_kwargs)
コード例 #7
0
def test_tenx_bundle_vx_with_no_expected_cell_count(
    test_tenx_bundle_uuid_vx,
    test_tenx_bundle_version_vx,
    test_tenx_bundle_manifest_vx,
    tenx_metadata_files_vx_with_no_expected_cell_count,
):
    """Return a `Bundle` assembled from the four given bundle components."""
    bundle_kwargs = {
        'uuid': test_tenx_bundle_uuid_vx,
        'version': test_tenx_bundle_version_vx,
        'manifest': test_tenx_bundle_manifest_vx,
        'metadata_files': tenx_metadata_files_vx_with_no_expected_cell_count,
    }
    return Bundle(**bundle_kwargs)
コード例 #8
0
 def test_imaging_protocol(self):
     """
     The canned staging bundle must contain a single imaging protocol with
     240 probes, all of the same assay type.
     """
     uuid = '94f2ba52-30c8-4de0-a78e-f95a3f8deb9c'
     version = '2019-04-03T103426.471000Z'
     canned = self._canned_bundle('staging', uuid, version)
     manifest, metadata_files = canned
     bundle = Bundle(uuid, version, manifest, metadata_files)
     imaging_protocols = [
         protocol for protocol in bundle.protocols.values()
         if isinstance(protocol, ImagingProtocol)
     ]
     # one() is handed the list of matches and yields the protocol to check.
     imaging_protocol = one(imaging_protocols)
     self.assertEqual(len(imaging_protocol.probe), 240)
     assay_types = set(p.assay_type for p in imaging_protocol.probe)
     self.assertEqual(assay_types, {'in situ sequencing'})
コード例 #9
0
 def test_analysis_protocol(self):
     """
     The canned bundle must contain exactly one analysis protocol with the
     expected document ID, protocol ID and (absent) protocol name.
     """
     uuid = 'ffee7f29-5c38-461a-8771-a68e20ec4a2e'
     version = '2019-02-02T065454.662896Z'
     manifest, metadata_files = self._canned_bundle('prod', uuid, version)
     bundle = Bundle(uuid, version, manifest, metadata_files)
     analysis_protocols = []
     for candidate in bundle.protocols.values():
         if isinstance(candidate, AnalysisProtocol):
             analysis_protocols.append(candidate)
     self.assertEqual(len(analysis_protocols), 1)
     protocol = analysis_protocols[0]
     self.assertEqual(str(protocol.document_id),
                      'bb17ee61-193e-4ae1-a014-4f1b1c19b8b7')
     self.assertEqual(protocol.protocol_id, 'smartseq2_v2.2.0')
     self.assertEqual(protocol.protocol_name, None)
コード例 #10
0
 def assert_bundle():
     """
     Build a bundle from the inputs found in the surrounding scope and
     verify the number and names of one specific project's contributors.
     """
     bundle = Bundle(uuid, version, manifest, metadata_files)
     project_id = UUID('d96c2451-6e22-441f-a3e6-70fd0878bb1b')
     project = bundle.projects[project_id]
     self.assertEqual(len(project.contributors), 5)
     expected_names = {
         'Sabina,,Kanton',
         'Barbara,,Treutlein',
         'J,Gray,Camp',
         'Mallory,Ann,Freeberg',
         'Zhisong,,He',
     }
     names = set(contributor.name for contributor in project.contributors)
     self.assertEqual(names, expected_names)
     # The `contact_name` attribute must yield the same set of names.
     # noinspection PyDeprecation
     contact_names = set(
         contributor.contact_name for contributor in project.contributors)
     self.assertEqual(contact_names, expected_names)
コード例 #11
0
 def test_canned_staging_area(self):
     """
     Load a staging area from a pinned GitHub ref and verify that every
     link in it yields a bundle containing the expected project.
     """
     ref = '55628953e4b3a24a7d7798569b6082032bd07a6b'
     url = f'https://github.com/HumanCellAtlas/schema-test-data/tree/{ref}/tests'
     staging_area = GitHubStagingAreaFactory.from_url(url).load_staging_area()
     self.assertGreater(len(staging_area.links), 0)
     for link_id in staging_area.links:
         # One sub-test per link so a failure names the offending link.
         with self.subTest(link_id=link_id):
             bundle_parts = staging_area.get_bundle(link_id)
             version, manifest, metadata_files = bundle_parts
             bundle = Bundle(link_id, version, manifest, metadata_files)
             self.assertEqual(bundle.uuid, UUID(link_id))
             project_id = UUID('90bf705c-d891-5ce2-aa54-094488b445c6')
             project = bundle.projects[project_id]
             self.assertEqual(project.estimated_cell_count, 10000)
コード例 #12
0
 def test_imaging_protocol(self):
     """
     The bundle loaded from the staging deployment must contain a single
     imaging protocol with 240 targets, all of the same assay type.
     """
     uuid = '94f2ba52-30c8-4de0-a78e-f95a3f8deb9c'
     version = '2019-04-03T103426.471000Z'
     loaded = self._load_bundle(uuid,
                                version,
                                replica='aws',
                                deployment='staging')
     manifest, metadata_files = loaded
     bundle = Bundle(uuid, version, manifest, metadata_files)
     imaging_protocols = [
         protocol for protocol in bundle.protocols.values()
         if isinstance(protocol, ImagingProtocol)
     ]
     # one() is handed the list of matches and yields the protocol to check.
     imaging_protocol = one(imaging_protocols)
     self.assertEqual(len(imaging_protocol.target), 240)
     assay_types = set(t.assay_type for t in imaging_protocol.target)
     self.assertEqual(assay_types, {'in situ sequencing'})
コード例 #13
0
 def test_cell_line(self):
     """
     The canned bundle must contain exactly one cell line with the
     expected identifiers, type and model organ.
     """
     uuid = 'ffee3a9b-14de-4dda-980f-c08092b2dabe'
     version = '2019-04-17T175706.867000Z'
     manifest, metadata_files = self._canned_bundle('prod', uuid, version)
     bundle = Bundle(uuid, version, manifest, metadata_files)
     cell_lines = []
     for biomaterial in bundle.biomaterials.values():
         if isinstance(biomaterial, CellLine):
             cell_lines.append(biomaterial)
     self.assertEqual(len(cell_lines), 1)
     cell_line = cell_lines[0]
     self.assertEqual(str(cell_line.document_id),
                      '961092cd-dcff-4b59-a0d2-ceeef0aece74')
     self.assertEqual(cell_line.biomaterial_id, 'cell_line_at_day_54')
     self.assertEqual(cell_line.has_input_biomaterial, None)
     self.assertEqual(cell_line.type, 'stem cell-derived')
     # The `cell_line_type` attribute must agree with `type`.
     # noinspection PyDeprecation
     self.assertEqual(cell_line.type, cell_line.cell_line_type)
     self.assertEqual(cell_line.model_organ, 'brain')
コード例 #14
0
    def test_name_substitution(self):
        """
        File names containing '!' in the canned manifest must show up with
        '/' in place of '!' once the bundle has been constructed, both in
        the bundle's manifest and in the JSON of its file entities.
        """
        uuid = 'ffee7f29-5c38-461a-8771-a68e20ec4a2e'
        version = '2019-02-02T065454.662896Z'
        manifest, metadata_files = self._canned_bundle('prod', uuid, version)

        # State of the raw manifest before the bundle is constructed:
        # a known set of names with '!' and none with '/'.
        names_before = [entry['name'] for entry in manifest]
        with_bang_before = {name for name in names_before if '!' in name}
        expected_bang_before = {
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!.zattrs',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!.zgroup',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_id!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_id!0',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_numeric!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_numeric!0.0',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_numeric_name!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_numeric_name!0',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_string!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_string!0.0',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_string_name!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!cell_metadata_string_name!0',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!expression!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!expression!0.0',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!gene_id!.zarray',
            '9ea49dd1-7511-48f8-be12-237e3d0690c0.zarr!gene_id!0',
        }
        self.assertEqual(expected_bang_before, with_bang_before)
        with_slash_before = {name for name in names_before if '/' in name}
        self.assertEqual(set(), with_slash_before)

        bundle = Bundle(uuid, version, manifest, metadata_files)

        # After construction every '!' is expected to have become '/'.
        expected_slash_after = {
            name.replace('!', '/') for name in with_bang_before
        }
        entity_json_file_names = {
            entity.json['file_core']['file_name']
            for entity in bundle.entities.values()
            if isinstance(entity, (AnalysisFile, SequenceFile))
        }
        manifest_file_names = set(bundle.manifest.keys())
        for names_after in (manifest_file_names, entity_json_file_names):
            with_bang_after = {name for name in names_after if '!' in name}
            self.assertEqual(set(), with_bang_after)
            with_slash_after = {name for name in names_after if '/' in name}
            self.assertEqual(expected_slash_after, with_slash_after)
コード例 #15
0
 def test_links_json_v2_0_0(self):
     """
     Test a bundle with a v2.0.0 links.json and supplementary_file links
     """
     uuid = 'cc0b5aa4-9f66-48d2-aa4f-ed019d1c9439'
     version = '2019-05-15T222432.561000Z'
     manifest, metadata_files = self._canned_bundle('prod', uuid, version)
     bundle = Bundle(uuid, version, manifest, metadata_files)

     # Expected number of links per link type, checked in this order.
     expected_counts = {'process_link': 6, 'supplementary_file_link': 2}
     for link_type, expected_count in expected_counts.items():
         actual_count = sum(
             1 for link in bundle.links if link.link_type == link_type)
         self.assertEqual(expected_count, actual_count)

     # Both ends of every link must name a known entity type and refer to
     # an entity in the bundle that is an instance of that type.
     entities = bundle.entities
     for link in bundle.links:
         self.assertIn(link.source_type, api_entity_types)
         self.assertIn(link.source_id, entities)
         source = entities[link.source_id]
         self.assertIsInstance(source, api_entity_types[link.source_type])
         self.assertIn(link.destination_type, api_entity_types)
         self.assertIn(link.destination_id, entities)
         destination = entities[link.destination_id]
         self.assertIsInstance(destination,
                               api_entity_types[link.destination_type])
コード例 #16
0
def get_bundle_metadata(uuid, version, dss_url, directurls=False):
    """Factory function to create a `humancellatlas.data.metadata.Bundle` object from bundle information and manifest.

    The deployment is taken from the second dot-separated component of
    `dss_url`. Any value other than 'dev', 'integration' or 'staging'
    selects the client's default deployment. Metadata is downloaded from the
    'gcp' replica.

    Args:
        uuid (str): The bundle uuid.
        version (str): The bundle version requested from the download. The
            version returned by the download is the one used to build the
            bundle.
        dss_url (str): Url of Data Storage System to query
        directurls: Passed through unchanged to `download_bundle_metadata`.

    Returns:
        humancellatlas.data.metadata.Bundle: A bundle metadata object.
    """
    non_default_deployments = ('dev', 'integration', 'staging')
    dss_deployment = dss_url.split('.')[1]
    if dss_deployment in non_default_deployments:
        client = dss_client(deployment=dss_deployment)
    else:
        # dss_client constructor defaults to the production deployment
        client = dss_client()
    downloaded = download_bundle_metadata(
        client=client,
        replica='gcp',
        uuid=uuid,
        version=version,
        directurls=directurls,
    )
    version, manifest, metadata_files = downloaded
    return Bundle(
        uuid=uuid,
        version=version,
        manifest=manifest,
        metadata_files=metadata_files,
    )
コード例 #17
0
 def to_json(fqid):
     """
     Download the metadata of the bundle identified by `fqid` (the UUID
     and version joined by the first '.') from the 'aws' replica and
     return the `as_json` representation of the resulting bundle.
     """
     uuid, _, requested_version = fqid.partition('.')
     downloaded = download_bundle_metadata(
         client, 'aws', uuid, requested_version, num_workers=0)
     version, manifest, metadata_files = downloaded
     return as_json(Bundle(uuid, version, manifest, metadata_files))
コード例 #18
0
    def _assert_bundle(self,
                       uuid,
                       version,
                       manifest,
                       metadata_files,
                       age_range=None,
                       diseases=frozenset({None}),
                       project_roles=frozenset({None}),
                       storage_methods=frozenset({None}),
                       preservation_methods=frozenset({None}),
                       library_construction_methods=frozenset(),
                       selected_cell_types=frozenset(),
                       accessions=frozenset(),
                       insdc_project_accessions=frozenset(),
                       geo_series_accessions=frozenset(),
                       array_express_accessions=frozenset(),
                       insdc_study_accessions=frozenset(),
                       is_sequencing_bundle=True,
                       slice_thickness=None,
                       ncbi_taxon_ids=None,
                       content_description=None) -> Bundle:
        """
        Construct a Bundle from the given inputs, run a fixed sequence of
        assertions against it and return it.

        :param uuid: the bundle UUID as a string; compared with
                     ``str(bundle.uuid)``
        :param version: the bundle version; compared with ``bundle.version``
        :param manifest: the manifest entries passed to ``Bundle``
        :param metadata_files: the metadata files passed to ``Bundle``
        :param age_range: expected ``organism_age_in_seconds`` of the first
                          root entity
        :param diseases: expected set of diseases over all donor organisms
                         and specimens from organisms
        :param project_roles: expected set of contributor project roles
        :param storage_methods: expected set of specimen storage methods
        :param preservation_methods: expected set of specimen preservation
                                     methods; if this and ``storage_methods``
                                     are both falsy, the bundle is expected
                                     to have no specimens
        :param library_construction_methods: expected set of library
                                             construction methods over all
                                             library preparation protocols
        :param selected_cell_types: expected selected cell types of the
                                    first cell suspension found; the cell
                                    suspension checks are skipped if None
        :param accessions: expected ``project.accessions``
        :param insdc_project_accessions: expected
                                         ``project.insdc_project_accessions``
        :param geo_series_accessions: expected
                                      ``project.geo_series_accessions``
        :param array_express_accessions: expected
                                         ``project.array_express_accessions``
        :param insdc_study_accessions: expected
                                       ``project.insdc_study_accessions``
        :param is_sequencing_bundle: whether to check the bundle's
                                     sequencing inputs and outputs
        :param slice_thickness: expected list of slice thicknesses of the
                                imaged specimens; not checked if None
        :param ncbi_taxon_ids: expected set of NCBI taxon IDs over all
                               biomaterials; not checked if None
        :param content_description: expected set of content descriptions
                                    over all files; not checked if None
        :return: the constructed bundle
        """
        bundle = Bundle(uuid, version, manifest, metadata_files)

        # Every data file's manifest entry should be referenced by a metadata
        # entity that describes the data file. id() is used to work around the
        # fact that dict instances aren't hashable and to ensure that no
        # redundant copies are made.
        self.assertEqual(
            set(id(f.manifest_entry.json) for f in bundle.files.values()),
            set(id(me) for me in manifest if not me['indexed']))

        biomaterials = bundle.biomaterials.values()

        if ncbi_taxon_ids is not None:
            self.assertSetEqual(
                ncbi_taxon_ids,
                set(chain(*(b.ncbi_taxon_id for b in biomaterials))))

        # The `diseases` and `disease` attributes are both collected so that
        # they can be checked against each other below.
        actual_diseases = set(
            chain(*(bm.diseases for bm in biomaterials
                    if isinstance(bm, (DonorOrganism, SpecimenFromOrganism)))))
        # noinspection PyDeprecation
        actual_disease = set(
            chain(*(bm.disease for bm in biomaterials
                    if isinstance(bm, (DonorOrganism, SpecimenFromOrganism)))))
        self.assertEqual(actual_diseases, diseases)
        self.assertEqual(actual_diseases, actual_disease)
        self.assertEqual(str(bundle.uuid), uuid)
        self.assertEqual(bundle.version, version)
        self.assertEqual(1, len(bundle.projects))

        if selected_cell_types is not None:
            # Only the first cell suspension encountered is examined.
            cell_suspension = next(x for x in bundle.biomaterials.values()
                                   if isinstance(x, CellSuspension))
            self.assertEqual(CellSuspension, type(cell_suspension))
            self.assertEqual(selected_cell_types,
                             cell_suspension.selected_cell_types)
            # noinspection PyDeprecation
            self.assertEqual(cell_suspension.selected_cell_types,
                             cell_suspension.selected_cell_type)
            # noinspection PyDeprecation
            self.assertEqual(cell_suspension.estimated_cell_count,
                             cell_suspension.total_estimated_cells)

        # Exactly one project was asserted above, so index 0 is that project.
        project = list(bundle.projects.values())[0]
        self.assertEqual(Project, type(project))
        self.assertEqual(project_roles,
                         {c.project_role
                          for c in project.contributors})
        # noinspection PyDeprecation
        self.assertLessEqual(len(project.laboratory_names),
                             len(project.contributors))
        # noinspection PyDeprecation
        self.assertEqual(project.project_short_name, project.project_shortname)

        self.assertEqual(insdc_project_accessions,
                         project.insdc_project_accessions)
        self.assertEqual(geo_series_accessions, project.geo_series_accessions)
        self.assertEqual(array_express_accessions,
                         project.array_express_accessions)
        self.assertEqual(insdc_study_accessions,
                         project.insdc_study_accessions)
        self.assertEqual(accessions, project.accessions)

        # Root entities may only be donor organisms or supplementary files,
        # and at least one donor organism must be among them.
        root_entities = bundle.root_entities().values()
        root_entity_types = {type(e) for e in root_entities}
        self.assertIn(DonorOrganism, root_entity_types)
        self.assertTrue({DonorOrganism,
                         SupplementaryFile}.issuperset(root_entity_types))
        # The first root entity in iteration order is required to be a donor.
        root_entity = next(iter(root_entities))
        self.assertRegex(root_entity.address, 'donor_organism@.*')
        self.assertIsInstance(root_entity, DonorOrganism)
        self.assertEqual(root_entity.organism_age_in_seconds, age_range)
        self.assertTrue(root_entity.sex in ('female', 'male', 'unknown'))
        # noinspection PyDeprecation
        self.assertEqual(root_entity.sex, root_entity.biological_sex)

        if is_sequencing_bundle:
            sequencing_input = bundle.sequencing_input
            self.assertGreater(
                len(sequencing_input), 0,
                "There should be at least one sequencing input")
            self.assertEqual(
                len(set(si.document_id for si in sequencing_input)),
                len(sequencing_input),
                "Sequencing inputs should be distinct entities")
            self.assertEqual(
                len(set(si.biomaterial_id for si in sequencing_input)),
                len(sequencing_input),
                "Sequencing inputs should have distinct biomaterial IDs")
            self.assertTrue(
                all(isinstance(si, Biomaterial) for si in sequencing_input),
                "All sequencing inputs should be instances of Biomaterial")
            sequencing_input_schema_names = set(si.schema_name
                                                for si in sequencing_input)
            self.assertTrue({
                'cell_suspension', 'specimen_from_organism'
            }.issuperset(
                sequencing_input_schema_names
            ), "The sequencing inputs in the test bundle are of specific schemas"
                            )

            sequencing_output = bundle.sequencing_output
            self.assertGreater(
                len(sequencing_output), 0,
                "There should be at least one sequencing output")
            self.assertEqual(
                len(set(so.document_id for so in sequencing_output)),
                len(sequencing_output),
                "Sequencing outputs should be distinct entities")
            self.assertTrue(
                all(isinstance(so, SequenceFile) for so in sequencing_output),
                "All sequencing outputs should be instances of SequenceFile")
            self.assertTrue(
                all(
                    so.manifest_entry.name.endswith('.fastq.gz')
                    for so in sequencing_output),
                "All sequencing outputs in the test bundle are fastq files.")

        # Specimens are expected only if the caller passed a non-empty set
        # for either the storage or the preservation methods.
        has_specimens = storage_methods or preservation_methods
        specimen_types = {type(s) for s in bundle.specimens}
        self.assertEqual({SpecimenFromOrganism} if has_specimens else set(),
                         specimen_types)

        self.assertEqual(storage_methods,
                         {s.storage_method
                          for s in bundle.specimens})
        self.assertEqual(preservation_methods,
                         {s.preservation_method
                          for s in bundle.specimens})

        if has_specimens:
            # Accessing `organ_part` on a specimen is expected to raise.
            # noinspection PyDeprecation
            self.assertRaises(AttributeError,
                              lambda: bundle.specimens[0].organ_part)

        # Prove that as_json returns a valid JSON structure (no cycles, correct types, etc.)
        self.assertTrue(
            isinstance(json.dumps(as_json(bundle), default=str), str))

        library_prep_protos = [
            p for p in bundle.protocols.values()
            if isinstance(p, LibraryPreparationProtocol)
        ]
        library_prep_proto_types = {type(p) for p in library_prep_protos}
        # Library preparation protocols count as present if the caller
        # expects any construction methods or if the bundle contains any.
        has_library_preps = library_construction_methods != set() or len(
            library_prep_protos) > 0
        self.assertEqual(
            {LibraryPreparationProtocol} if has_library_preps else set(),
            library_prep_proto_types)
        self.assertEqual(
            library_construction_methods,
            {p.library_construction_method
             for p in library_prep_protos})
        # noinspection PyDeprecation
        self.assertEqual(
            library_construction_methods,
            {p.library_construction_approach
             for p in library_prep_protos})

        if slice_thickness is not None:
            self.assertEqual(slice_thickness, [
                s.slice_thickness for s in bundle.entities.values()
                if isinstance(s, ImagedSpecimen)
            ])

        if content_description is not None:
            self.assertSetEqual(
                content_description,
                set(
                    chain.from_iterable(file.content_description
                                        for file in bundle.files.values())))

        return bundle
コード例 #19
0
ファイル: can_bundle.py プロジェクト: NoopDog/azul
def main(argv):
    """
    Download the metadata of one bundle from the DSS and write it to JSON
    files in the output directory.

    Two files are always written, named after the bundle UUID:
    ``<uuid>.manifest.json`` and ``<uuid>.metadata.json``. If ``--api-json``
    is given, the return value of metadata-api's ``as_json`` for the bundle
    is additionally written to ``<uuid>.api.json``.

    :param argv: the command line arguments to parse, without the program
                 name
    """
    parser = argparse.ArgumentParser(description=__doc__)
    parser.add_argument(
        '--dss-url',
        '-u',
        default=config.dss_endpoint,
        help=
        'The URL of the DSS REST API endpoint from which to download the bundle to be canned '
        '(default: %(default)s).')
    parser.add_argument(
        '--replica',
        '-r',
        default='aws',
        help=
        "The replica from which to download the bundle to be canned (default: %(default)s)."
    )
    parser.add_argument('--uuid',
                        '-b',
                        required=True,
                        help='The UUID of the bundle to can.')
    parser.add_argument(
        '--version',
        '-v',
        help='The version of the bundle to can (default: the latest version).'
    )
    parser.add_argument(
        '--output-dir',
        '-O',
        default=os.path.join(config.project_root, 'test', 'indexer', 'data'),
        help='The path to the output directory (default: %(default)s).')
    parser.add_argument(
        '--api-json',
        '-A',
        default=False,
        action='store_true',
        help=
        "Dump the return value of metadata-api's as_json function (default off)."
    )
    args = parser.parse_args(argv)

    dss_client = azul.dss.direct_access_client(
        dss_endpoint=args.dss_url, num_workers=config.num_dss_workers)
    # The download returns the version it actually fetched. That version is
    # used from here on because args.version may be None.
    version, manifest, metadata_files = download_bundle_metadata(
        client=dss_client,
        replica=args.replica,
        uuid=args.uuid,
        version=args.version,
        num_workers=config.num_dss_workers)
    logger.info('Downloaded bundle %s version %s from replica %s.', args.uuid,
                version, args.replica)

    if args.api_json:
        bundle = Bundle(args.uuid, version, manifest, metadata_files)
        api_json = as_json(bundle)
    else:
        api_json = None

    outputs = [(manifest, ".manifest.json"),
               (metadata_files, '.metadata.json')]
    # Test against None rather than truthiness so that a requested API dump
    # is written even if as_json returns an empty structure.
    if api_json is not None:
        outputs.append((api_json, ".api.json"))

    for obj, suffix in outputs:
        path = os.path.join(args.output_dir, args.uuid + suffix)
        with write_file_atomically(path) as f:
            json.dump(obj, f, indent=4)
        logger.info("Successfully wrote %s", path)