Beispiel #1
0
class SpreadsheetImport(TestCase):
    """Integration test for importing a metadata spreadsheet.

    The test downloads a spreadsheet, imports it into a freshly created
    submission through ``XlsImporter`` and then checks that the number of
    entities the ingest API reports for that submission matches the number
    of entities the importer produced, per entity type.
    """

    def setUp(self):
        """Prepare the test-data directory path and the ingest API client."""
        # Only the test method assigns a real path; give tearDown a defined
        # value to read even if a test never gets as far as assigning it.
        self.metadata_spreadsheet_path = None
        self.test_data_path = os.path.dirname(os.path.realpath(__file__))
        self.configure_ingest_client()

    def configure_ingest_client(self):
        """Build the token client, token manager and ``IngestApi`` instance.

        The credentials file location is read from the
        ``GOOGLE_APPLICATION_CREDENTIALS`` environment variable.
        """
        gcp_credentials_file = os.environ.get('GOOGLE_APPLICATION_CREDENTIALS')
        self.s2s_token_client = S2STokenClient(
            ServiceCredential.from_file(gcp_credentials_file),
            INGEST_API_JWT_AUDIENCE)
        self.token_manager = TokenManager(self.s2s_token_client)
        self.ingest_api = IngestApi(url=INGEST_API,
                                    token_manager=self.token_manager)

    def test_spreadsheet_import(self):
        """Import the spreadsheet and compare entity counts per type."""
        self.metadata_spreadsheet_path = os.path.join(self.test_data_path,
                                                      SPREADSHEET_FILE)
        download_file(SPREADSHEET_LOCATION, self.metadata_spreadsheet_path)
        importer = XlsImporter(self.ingest_api)
        submission_resource = self.ingest_api.create_submission()

        # Keep only the part of the self link before the first "{"
        # (presumably a URI-template suffix -- confirm against the API).
        submission_url = submission_resource["_links"]["self"]["href"].rsplit(
            "{")[0]
        submission, _ = importer.import_file(self.metadata_spreadsheet_path,
                                             submission_url, False)

        # Group the entities produced by the importer by their type.
        entities_by_type = {}
        for entity in submission.get_entities():
            entities_by_type.setdefault(entity.type, []).append(entity)

        # (collection name used by the API, entity type used by the importer)
        collections = (
            ('files', 'file'),
            ('biomaterials', 'biomaterial'),
            ('protocols', 'protocol'),
            ('processes', 'process'),
        )
        for collection, entity_type in collections:
            stored = list(
                self.ingest_api.get_entities(submission_url, collection))
            # A type absent from the spreadsheet counts as zero entities
            # instead of aborting the test with a KeyError.
            imported = entities_by_type.get(entity_type, [])
            self.assertEqual(len(stored), len(imported))

    def tearDown(self) -> None:
        """Remove the downloaded spreadsheet, if a path was recorded."""
        if self.metadata_spreadsheet_path:
            delete_file(self.metadata_spreadsheet_path)
Beispiel #2
0
class IngestHydrator(Hydrator):
    """
    DCP Ingest Service Submission hydrator class.

    Enables importing of HCA Ingest Service submissions by specifying a Submission ID.

    On construction the hydrator resolves the project related to the given
    submission, collects the entities of every submission envelope linked to
    that project, and builds the graph nodes and edges from them.
    """
    def __init__(self, graph, submission_uuid):
        """Load all entities for the submission's project and build the graph.

        Args:
            graph: Passed straight through to ``Hydrator.__init__``.
            submission_uuid: UUID of the submission used to find the project.
        """
        super().__init__(graph)

        self._logger.info(
            f"Started ingest hydrator for submission [{submission_uuid}]")

        self._ingest_api = IngestApi(Config['INGEST_API'])

        project_url = self._ingest_api.get_submission_by_uuid(
            submission_uuid)['_links']['relatedProjects']['href']
        # Only the first related project is used.
        project = self._ingest_api.get(
            project_url).json()['_embedded']['projects'][0]

        self._logger.info(
            f"Found project for submission {project['uuid']['uuid']}")

        # Entities keyed by their UUID; an entity seen in several
        # submissions is stored once (the last occurrence wins).
        self._entities = {}
        for submission in self.fetch_submissions_in_project(project):
            self._logger.info(
                f"Found submission for project with uuid {submission['uuid']['uuid']}"
            )
            for entity in self.build_entities_from_submission(submission):
                self._entities[entity['uuid']] = entity

        self._nodes = self.get_nodes()
        self._edges = self.get_edges()

    def fetch_submissions_in_project(self, project: dict) -> [dict]:
        """Return the submission envelopes linked to ``project``.

        Args:
            project: Project resource containing a
                ``_links.submissionEnvelopes.href`` entry.

        Returns:
            The ``_embedded.submissionEnvelopes`` list of the response.
        """
        self._logger.debug(
            f"Fetching submissions for project {project['uuid']['uuid']}")
        # NOTE(review): only the body of this single GET is read; if the
        # endpoint paginates, further pages are not fetched -- confirm.
        return self._ingest_api.get(
            project['_links']['submissionEnvelopes']
            ['href']).json()['_embedded']['submissionEnvelopes']

    def build_entities_from_submission(self, submission: dict):
        """Yield one normalised entity dict per entity in ``submission``.

        Each yielded dict has the keys ``properties`` (flattened content),
        ``labels`` (lower-cased entity type plus the last path segment of
        ``describedBy``), ``node_id``, ``links`` and ``uuid``.

        Args:
            submission: Submission resource containing ``_links.self.href``.

        Raises:
            KeyError: If an entity lacks its ID field or ``describedBy``.
        """
        # Entity collection name -> flattened property holding its ID.
        # The dict also defines which collections are fetched, and in
        # which order.
        id_field_map = {
            'biomaterials': "biomaterial_core.biomaterial_id",
            'files': "file_core.file_name",
            'processes': "process_core.process_id",
            'projects': "project_core.project_short_name",
            'protocols': "protocol_core.protocol_id",
        }
        submission_url = submission['_links']['self']['href']

        for entity_type, id_field in id_field_map.items():
            for entity in self._ingest_api.get_entities(submission_url,
                                                        entity_type):
                properties = flatten(entity['content'])

                # Last path segment of describedBy; [-1] also copes with a
                # value that contains no "/" at all.
                concrete_type = properties['describedBy'].rsplit('/', 1)[-1]

                yield {
                    'properties': properties,
                    'labels': [entity['type'].lower(), concrete_type],
                    'node_id': properties[id_field],
                    'links': entity['_links'],
                    'uuid': entity['uuid']['uuid'],
                }

    @benchmark
    def get_nodes(self):
        """Create one ``Node`` per collected entity.

        Returns:
            Dict mapping entity UUID to its ``Node``.
        """
        self._logger.debug("importing nodes")

        nodes = {}

        for entity_uuid, entity in self._entities.items():
            node_id = entity['node_id']
            # Merge into a single mapping before the call: passing
            # **properties next to explicit uuid=/self_link=/id= keywords
            # raises TypeError if a property shares one of those names.
            # The explicit values take precedence.
            node_properties = {
                **entity['properties'],
                'uuid': entity['uuid'],
                'self_link': entity['links']['self']['href'],
                'id': node_id,
            }
            nodes[entity_uuid] = Node(*entity['labels'], **node_properties)

            self._logger.debug(f"({node_id})")

        self._logger.info(f"imported {len(nodes)} nodes")

        return nodes

    @benchmark
    def get_edges(self):
        """Create ``Relationship`` edges between the collected nodes.

        For every entity, each known link type present in its links is
        followed and an edge is added to every linked entity that has a
        node. Linked entities without a node are logged and skipped.

        Returns:
            List of ``Relationship`` objects.
        """
        self._logger.debug("importing edges")

        edges = []
        # Link name on an entity -> collection name of the linked entities.
        relationship_map = {
            'projects': "projects",
            'protocols': "protocols",
            'inputToProcesses': "processes",
            'derivedByProcesses': "processes",
            'inputBiomaterials': "biomaterials",
            'derivedBiomaterials': "biomaterials",
            'supplementaryFiles': "files",
            'inputFiles': "files",
            'derivedFiles': "files",
        }

        for entity_uuid, entity in self._entities.items():
            start_node = self._nodes[entity_uuid]

            for relationship_type, target_type in relationship_map.items():
                if relationship_type not in entity['links']:
                    continue

                relationships = self._ingest_api.get_all(
                    entity['links'][relationship_type]['href'], target_type)
                relationship_name = convert_to_macrocase(relationship_type)

                for end_entity in relationships:
                    # Keep the try limited to the lookup so that a KeyError
                    # raised elsewhere is not misreported as a missing node.
                    try:
                        end_node = self._nodes[end_entity['uuid']['uuid']]
                    except KeyError:
                        self._logger.debug(
                            f"Missing end node at a [{start_node['id']}] entity."
                        )
                        continue

                    edges.append(
                        Relationship(start_node, relationship_name, end_node))

                    # Adding additional relationships to the graphs.
                    if relationship_name == 'INPUT_TO_PROCESSES':
                        edges.append(
                            Relationship(start_node,
                                         'DUMMY_EXPERIMENTAL_DESIGN',
                                         end_node))
                    if relationship_name == 'DERIVED_BY_PROCESSES':
                        # Reversed direction: from the process to the entity.
                        edges.append(
                            Relationship(end_node,
                                         'DUMMY_EXPERIMENTAL_DESIGN',
                                         start_node))

                    self._logger.debug(
                        f"({start_node['id']})-[:{relationship_name}]->({end_node['id']})"
                    )

        self._logger.info(f"imported {len(edges)} edges")

        return edges