Exemple #1
0
class SpreadsheetImport(TestCase):
    """Import a metadata spreadsheet and compare it with what the API stores.

    The test downloads a spreadsheet next to this test module, imports it
    into a freshly created submission, and checks that the number of entities
    the ingest API returns for the submission matches the number of entities
    the importer produced, per entity type.
    """

    def setUp(self):
        # Defined before anything can fail so that tearDown can always read
        # it, even when the test errors out before the download happens.
        self.metadata_spreadsheet_path = None
        self.test_data_path = os.path.dirname(os.path.realpath(__file__))
        self.configure_ingest_client()

    def configure_ingest_client(self):
        """Build the token client, token manager and ingest API client.

        The service credentials are read from the file named by the
        ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable.
        """
        gcp_credentials_file = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
        self.s2s_token_client = S2STokenClient(
            ServiceCredential.from_file(gcp_credentials_file),
            INGEST_API_JWT_AUDIENCE)
        self.token_manager = TokenManager(self.s2s_token_client)
        self.ingest_api = IngestApi(url=INGEST_API,
                                    token_manager=self.token_manager)

    def test_spreadsheet_import(self):
        self.metadata_spreadsheet_path = os.path.join(self.test_data_path,
                                                      SPREADSHEET_FILE)
        download_file(SPREADSHEET_LOCATION, self.metadata_spreadsheet_path)
        importer = XlsImporter(self.ingest_api)
        submission_resource = self.ingest_api.create_submission()

        # Keep only the part of the self link that precedes the first "{".
        self_link = submission_resource["_links"]["self"]["href"]
        submission_url = self_link.split("{", 1)[0]
        submission, _ = importer.import_file(self.metadata_spreadsheet_path,
                                             submission_url, False)

        # Group the imported entities by their type.
        entities_by_type = {}
        for entity in submission.get_entities():
            entities_by_type.setdefault(entity.type, []).append(entity)

        files = list(self.ingest_api.get_entities(submission_url, 'files'))
        biomaterials = list(
            self.ingest_api.get_entities(submission_url, 'biomaterials'))
        protocols = list(
            self.ingest_api.get_entities(submission_url, 'protocols'))
        processes = list(
            self.ingest_api.get_entities(submission_url, 'processes'))

        # `.get(..., [])` turns a missing entity type into a count mismatch
        # reported by the assertion rather than a bare KeyError.
        self.assertEqual(len(files), len(entities_by_type.get('file', [])))
        self.assertEqual(len(biomaterials),
                         len(entities_by_type.get('biomaterial', [])))
        self.assertEqual(len(protocols),
                         len(entities_by_type.get('protocol', [])))
        self.assertEqual(len(processes),
                         len(entities_by_type.get('process', [])))

    def tearDown(self) -> None:
        # getattr guards against the attribute never having been assigned.
        spreadsheet_path = getattr(self, 'metadata_spreadsheet_path', None)
        if spreadsheet_path:
            delete_file(spreadsheet_path)
Exemple #2
0
class TestIngest(unittest.TestCase):
    """Helper methods for ingest tests that run against a named deployment.

    The deployment name comes from the ``DEPLOYMENT_ENV`` environment
    variable and must be one of ``DEPLOYMENTS``. The helpers build fixtures
    and runners, execute them, and (for analysis and update runs) assert on
    what the runner produced.
    """

    def setUp(self):
        """Resolve the deployment and create the API clients and agents."""
        deployment = os.environ.get('DEPLOYMENT_ENV', None)
        self.deployment = deployment

        if deployment not in DEPLOYMENTS:
            raise RuntimeError(f'DEPLOYMENT_ENV environment variable must be one of {DEPLOYMENTS}')

        api_url = f"https://api.ingest.{deployment}.data.humancellatlas.org"
        self.ingest_client_api = IngestApi(url=api_url)

        # Token client is configured from the service credentials file.
        self.s2s_token_client = S2STokenClient()
        credentials_path = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
        self.s2s_token_client.setup_from_file(credentials_path)
        self.token_manager = TokenManager(self.s2s_token_client)

        self.ingest_broker = IngestUIAgent(deployment)
        self.ingest_api = IngestApiAgent(deployment=deployment)

    def ingest_and_upload_only(self, dataset_name):
        """Run `valid_run` for the named dataset and return the runner."""
        fixture = DatasetFixture(dataset_name, self.deployment)
        dataset_runner = DatasetRunner(self.deployment)
        dataset_runner.valid_run(fixture)
        return dataset_runner

    def ingest(self, dataset_name):
        """Run `complete_run` for the named dataset and return the runner."""
        fixture = DatasetFixture(dataset_name, self.deployment)
        dataset_runner = DatasetRunner(self.deployment)
        dataset_runner.complete_run(fixture)
        return dataset_runner

    def _create_submission_envelope(self):
        """Create a submission through the client API and wrap it as an envelope."""
        bearer = self.token_manager.get_token()
        self.ingest_client_api.set_token(f'Bearer {bearer}')
        created = self.ingest_client_api.create_submission()
        self_href = created["_links"]["self"]["href"]
        return self.ingest_api.envelope(envelope_id=None, url=self_href)

    # TODO move this to ingest client api
    def _get_entities(self, url, entity_type):
        """GET `url` and return the embedded list for `entity_type`, or []."""
        reply = requests.get(url, headers={'Content-type': 'application/json'})
        reply.raise_for_status()
        payload = reply.json()

        embedded = payload.get('_embedded')
        if not embedded:
            return []

        entities = embedded.get(entity_type)
        if not entities:
            return []
        return entities

    def ingest_analysis(self, dataset_name):
        """Run an analysis submission and assert its files and bundle links.

        Checks that the analysis process has derived files equal to the
        analysis submission's files, input files equal to the primary
        submission's files, and exactly one input bundle manifest whose
        bundle uuid matches the runner's. Returns the runner.
        """

        def uuids_of(documents):
            # Each document carries its identifier under ['uuid']['uuid'].
            return [document['uuid']['uuid'] for document in documents]

        analysis_fixture = AnalysisSubmissionFixture()
        runner = AnalysisSubmissionRunner(
            self.deployment,
            self.ingest_broker,
            self.ingest_api,
            self.token_manager,
            self.ingest_client_api,
        )
        dataset_fixture = DatasetFixture(dataset_name, self.deployment)
        runner.run(dataset_fixture, analysis_fixture)

        self.assertTrue(runner.bundle_manifest_uuid,
                        'The analysis process should be attached to an input bundle manifest')

        process_links = runner.analysis_process['_links']

        # Derived files of the process vs. files of the analysis submission.
        derived_files = self._get_entities(process_links['derivedFiles']['href'], 'files')
        analysis_files = runner.analysis_submission.get_files()
        derived_file_uuids = uuids_of(derived_files)
        analysis_file_uuids = uuids_of(analysis_files)

        self.assertTrue(derived_file_uuids, 'There must be files in the analysis submission')
        self.assertEqual(derived_file_uuids, analysis_file_uuids,
                         'The analyses files must be linked to the analyses process.')

        # Input files of the process vs. files of the primary submission.
        input_files = self._get_entities(process_links['inputFiles']['href'], 'files')
        primary_files = runner.primary_submission.get_files()
        input_file_uuids = uuids_of(input_files)
        primary_file_uuids = uuids_of(primary_files)

        self.assertTrue(input_file_uuids, 'There must be files from the primary submission')
        self.assertEqual(input_file_uuids, primary_file_uuids,
                         'The primary submission files must be linked to the analyses process.')

        # Bundle manifests attached as input to the process.
        manifests_href = runner.analysis_process['_links']['inputBundleManifests']['href']
        manifests = self._get_entities(manifests_href, 'bundleManifests')

        self.assertEqual(len(manifests), 1,
                         'There should only be one input bundle manifest for the analyses process')
        self.assertEqual(manifests[0]['bundleUuid'],
                         runner.bundle_manifest_uuid,
                         'The input bundle manifest for the analyses process is incorrect')

        return runner

    def ingest_big_submission(self):
        """Run a `BigSubmissionRunner` with a fresh `MetadataFixture`."""
        fixture = MetadataFixture()
        big_runner = BigSubmissionRunner(self.deployment, self.ingest_client_api, self.token_manager)
        big_runner.run(fixture)

    def ingest_updates(self):
        """Run an update submission and assert exactly one bundle was updated."""
        update_runner = UpdateSubmissionRunner(
            self.deployment,
            self.ingest_broker,
            self.ingest_api,
            self.ingest_client_api,
        )
        update_runner.run()

        updated_count = len(update_runner.updated_bundle_fqids)
        self.assertEqual(updated_count, 1, "There should be 1 bundle updated.")
Exemple #3
0
class IngestApiAgent:
    """Client for the ingest REST API of a single deployment.

    Wraps an `IngestApi` client plus raw authenticated `requests` calls, and
    exposes `Project` and `SubmissionEnvelope` helpers built on top of them.
    """

    def __init__(self, deployment):
        self.deployment = deployment
        self.ingest_api_url = self._ingest_api_url()
        self.ingest_auth_agent = IngestAuthAgent()
        self._set_up_ingest_client()

    def _set_up_ingest_client(self):
        """Create the `IngestApi` client and give it the Authorization value."""
        self.ingest_api = IngestApi(url=self.ingest_api_url)
        auth_header = self.ingest_auth_agent.make_auth_header()
        self.ingest_api.set_token(auth_header['Authorization'])

    def project(self, project_id):
        """Return a `Project` loaded from the API by its id."""
        return IngestApiAgent.Project(project_id=project_id,
                                      ingest_api_agent=self)

    def submission(self, submission_id):
        """Return a `SubmissionEnvelope` loaded from the API by its id."""
        return IngestApiAgent.SubmissionEnvelope(envelope_id=submission_id,
                                                 ingest_api_agent=self)

    def new_submission(self, is_update=False):
        """Create a submission via the ingest client and wrap the result."""
        submission_data = self.ingest_api.create_submission(
            update_submission=is_update)
        return IngestApiAgent.SubmissionEnvelope(ingest_api_agent=self,
                                                 data=submission_data)

    def iter_submissions(self):
        """Yield a `SubmissionEnvelope` for every envelope, page by page."""
        for page in self.iter_pages('/submissionEnvelopes', page_size=500):
            for submission_data in page['submissionEnvelopes']:
                yield IngestApiAgent.SubmissionEnvelope(data=submission_data,
                                                        ingest_api_agent=self)

    def get_all(self, path_or_url, result_element_we_are_interested_in):
        """Get a collection resource.

        Iterates through all pages gathering results and returns a list.
        """
        results = []
        for page in self.iter_pages(path_or_url):
            results += page[result_element_we_are_interested_in]
        return results

    def iter_pages(self, path_or_url, page_size=100):
        """Iterate through a collection using HATEOAS pagination, yielding pages.

        Each yielded value is the ``_embedded`` element of one page. Iteration
        stops at the first page without ``_embedded`` or without a ``next``
        link.
        """
        # Append the page size with the correct separator so that a link
        # which already carries a query string stays well-formed.
        separator = '&' if '?' in path_or_url else '?'
        path_or_url = f"{path_or_url}{separator}size={page_size}"

        while True:
            data = self.get(path_or_url)
            if '_embedded' not in data:
                break

            yield data['_embedded']

            # A page without `_links` is treated as the last page.
            links = data.get('_links', {})
            if 'next' not in links:
                break
            path_or_url = links['next']['href']

    def get(self, path_or_url):
        """Get a singleton resource.

        `path_or_url` is used as-is when it starts with ``http``; otherwise it
        is appended to the deployment's API URL. Returns the decoded JSON
        body, or raises `RuntimeError` when the response is not OK.
        """
        if path_or_url.startswith('http'):
            url = path_or_url
        else:
            url = f"{self.ingest_api_url}{path_or_url}"

        response = requests.get(
            url, headers=self.ingest_auth_agent.make_auth_header())

        if response.ok:
            return response.json()
        raise RuntimeError(f"GET {url} got {response}")

    def post(self, url, content, params=None):
        """POST `content` as JSON to `url` and return the decoded JSON reply.

        `params` are optional query parameters. Raises the `requests` HTTP
        error when the response status indicates failure.
        """
        # None instead of a shared mutable `{}` default; normalised here so
        # the request receives the same value as before.
        if params is None:
            params = {}
        auth_header = self.ingest_auth_agent.make_auth_header()
        response = requests.post(url,
                                 json=content,
                                 headers=auth_header,
                                 params=params)
        response.raise_for_status()
        return response.json()

    def put(self, url, content=None):
        """PUT to `url`, sending `content` as JSON only when it is truthy.

        Returns the decoded JSON reply; raises the `requests` HTTP error when
        the response status indicates failure.
        """
        auth_header = self.ingest_auth_agent.make_auth_header()
        if content:
            response = requests.put(url, json=content, headers=auth_header)
        else:
            response = requests.put(url, headers=auth_header)
        response.raise_for_status()
        return response.json()

    def _ingest_api_url(self):
        """Return the API base URL; 'prod' has no deployment segment."""
        if self.deployment == 'prod':
            return "https://api.ingest.data.humancellatlas.org"
        return f"https://api.ingest.{self.deployment}.data.humancellatlas.org"

    class Project:
        """A project resource, loaded from the API on construction."""

        def __init__(self, project_id, ingest_api_agent):
            self.project_id = project_id
            self.api = ingest_api_agent
            self.data = None
            self._load()

        @property
        def uuid(self):
            return self.data['uuid']

        def submission_envelopes(self):
            """Return the project's submission envelopes as a list.

            Returns an empty list when the response embeds no envelopes.
            """
            data = self.api.get(
                self.data['_links']['submissionEnvelopes']['href'])
            # `_embedded` may be missing when there is nothing to embed.
            envelopes = data.get('_embedded', {}).get('submissionEnvelopes', [])
            return [
                IngestApiAgent.SubmissionEnvelope(data=subm_data,
                                                  ingest_api_agent=self.api)
                for subm_data in envelopes
            ]

        def _load(self):
            self.data = self.api.get(f"/projects/{self.project_id}")

    class SubmissionEnvelope:
        """A submission envelope resource and the operations on it."""

        # May be primed with data, or if you supply an ID, we will go get
        # the data.
        def __init__(self, ingest_api_agent, envelope_id=None, data=None):
            if not envelope_id and not data:
                raise RuntimeError(
                    "either envelope_id or data must be provided")
            self.api = ingest_api_agent
            self.data = None
            if envelope_id:
                self.envelope_id = envelope_id
                self._load()
            else:
                self.data = data
                # The id is the last path segment of the self link.
                self.envelope_id = data['_links']['self']['href'].split(
                    '/')[-1]

        def __str__(self):
            return f"SubmissionEnvelope(id={self.envelope_id}, uuid={self.uuid}, " \
                f"status={self.status})"

        def _link_to(self, endpoint_path):
            """Return the href of the named entry in this envelope's links."""
            return self.data['_links'][endpoint_path]['href']

        def files(self):
            """Return all files of this envelope as a list."""
            return self.api.get_all(self._link_to('files'), 'files')

        def metadata_documents(self, metadata_type: str = None):
            """Return all documents of `metadata_type` in this envelope.

            Raises `RuntimeError` when no type is given and `KeyError` when
            the type is unknown.
            """
            self._check_metadata_type(metadata_type)
            result_type = _pluralized_type[metadata_type]
            metadata_link = self._link_to(result_type)
            return self.api.get_all(metadata_link, result_type)

        def add_biomaterial(self,
                            biomaterial_content,
                            update_target_uuid: str = None):
            """Post a biomaterial to this envelope and return the reply."""
            return self._add_metadata('biomaterial',
                                      biomaterial_content,
                                      update_target_uuid=update_target_uuid)

        def _add_metadata(self,
                          metadata_type,
                          metadata_content,
                          update_target_uuid: str = None):
            """Post `metadata_content` to the endpoint for `metadata_type`.

            When `update_target_uuid` is given it is sent as the
            ``updatingUuid`` query parameter.
            """
            self._check_metadata_type(metadata_type)
            endpoint_path = _pluralized_type[metadata_type]
            metadata_link = self._link_to(endpoint_path)
            params = {
                'updatingUuid': update_target_uuid
            } if update_target_uuid else {}
            return self.api.post(metadata_link,
                                 metadata_content,
                                 params=params)

        @staticmethod
        def _check_metadata_type(metadata_type):
            """Raise unless `metadata_type` is given and is a known type."""
            if not metadata_type:
                raise RuntimeError('`metadata_type` must be specified')
            if metadata_type not in _pluralized_type:
                raise KeyError(f'Unknown metadata type [{metadata_type}].')

        def iter_files(self):
            """Yield this envelope's files one at a time, page by page."""
            for page in self.api.iter_pages(self._link_to('files')):
                for file in page['files']:
                    yield file

        def reload(self):
            """Refresh the envelope data from the API and return self."""
            self._load()
            return self

        def check_validation(self):
            """Reload and report whether the envelope is Valid or Submitted.

            Raises `Exception` when the reloaded status is 'Invalid'.
            """
            self._load()
            if self.status == 'Invalid':
                raise Exception("envelope status is Invalid")
            return self.status in ['Valid', 'Submitted']

        def check_status(self):
            """Reload the envelope and return its current status."""
            self._load()
            return self.status

        @property
        def status(self):
            return self.data['submissionState']

        @property
        def uuid(self):
            return self.data['uuid']['uuid']

        def upload_credentials(self):
            """ Return upload area credentials or None if this envelope doesn't have an upload area yet """
            staging_details = self.data.get('stagingDetails', None)
            if not staging_details:
                return None
            # The location may be absent or null; both mean "no area yet".
            location = staging_details.get('stagingAreaLocation')
            if not location:
                return None
            return location.get('value', None)

        def bundles(self):
            """Return the bundle uuids of this envelope's bundle manifests."""
            manifests = self.api.get_all(self._link_to('bundleManifests'),
                                         'bundleManifests')
            return [manifest['bundleUuid'] for manifest in manifests]

        def complete(self):
            """PUT to the envelope's 'submit' link."""
            completion_endpoint = self._link_to('submit')
            self.api.put(completion_endpoint)

        def _load(self):
            self.data = self.api.get(
                f"/submissionEnvelopes/{self.envelope_id}")