def _download_data(self):
        """Download OncoTree source data for loading into normalizer.

        Looks up the release date of the ``oncotree_latest_stable`` API
        version, streams the matching tumor type tree into
        ``oncotree_<version>.json`` under ``self._data_path`` and stores the
        version string on ``self._version``.

        :raise requests.exceptions.RequestException: if a request fails or
            the tree endpoint answers with an HTTP error status
        :raise IndexError: if the versions listing contains no
            ``oncotree_latest_stable`` entry
        """
        logger.info('Downloading OncoTree...')
        # get version for latest stable release
        versions_url = f"{self._SRC_API_ROOT}versions"
        versions = json.loads(requests.get(versions_url).text)
        latest = [
            v['release_date'] for v in versions
            if v['api_identifier'] == 'oncotree_latest_stable'
        ][0]
        # release dates use dashes; file names and the API use underscores
        version = latest.replace('-', '_')

        # download data
        url = f'{self._SRC_API_ROOT}tumorTypes/tree?version=oncotree_{version}'
        try:
            response = requests.get(url, stream=True)
            # fail loudly instead of saving an HTTP error page as source data
            response.raise_for_status()
        except requests.exceptions.RequestException as e:
            logger.error(f'OncoTree download failed: {e}')
            raise
        filename = self._data_path / f'oncotree_{version}.json'
        # context manager guarantees the file is flushed and closed, even if
        # the stream breaks part-way through
        with open(filename, 'wb') as handle:
            for chunk in response.iter_content(chunk_size=512):
                if chunk:
                    handle.write(chunk)
        self._version = version
        logger.info('Finished downloading OncoTree')
    def update_record(self,
                      concept_id: str,
                      field: str,
                      new_value: Any,
                      item_type: str = 'identity'):
        """Overwrite one attribute of a stored record.

        A boto3 ``ClientError`` raised by the update is logged rather than
        propagated.

        :param str concept_id: concept ID of the record to modify
        :param str field: name of the attribute to set
        :param Any new_value: value to store under that attribute
        :param str item_type: record type, one of {'identity', 'merger'}
        """
        partition_key = f'{concept_id.lower()}##{item_type}'
        request = {
            'Key': {
                'label_and_type': partition_key,
                'concept_id': concept_id,
            },
            'UpdateExpression': f"set {field}=:r",
            'ExpressionAttributeValues': {':r': new_value},
        }
        try:
            self.diseases.update_item(**request)
        except ClientError as e:
            reason = e.response['Error']['Message']
            logger.error(f"boto3 client error in `database.update_record()`: "
                         f"{reason}")
Ejemplo n.º 3
0
    def _generate_merged_record(self, record_id_set: Set[str]) -> (Dict, List):
        """Build a single merged record from a group of concept IDs.

        Set-valued attributes ('aliases', 'associated_with') are unioned over
        every retrievable record in the group. Scalar attributes ('label',
        'pediatric_disease') are taken from the highest-ranked record that
        has them. Records are ranked by ``SourcePriority`` and then by
        concept ID (documented priority: NCIt > Mondo > OncoTree > DO).

        IDs that cannot be retrieved are logged and left out.

        :param Set record_id_set: group of concept IDs
        :return: merged record ready to be stored in the DB, together with
            the list of concept IDs that actually contributed to it
        """
        retrieved = []
        final_ids = []
        for identifier in record_id_set:
            fetched = self._database.get_record_by_id(identifier)
            if not fetched:
                logger.error(f"generate_merged_record could not retrieve "
                             f"record for {identifier} in {record_id_set}")
                continue
            retrieved.append(fetched)
            final_ids.append(fetched['concept_id'])

        # best source first; concept ID keeps the order stable within a source
        retrieved.sort(key=lambda rec: (
            SourcePriority[rec['src_name'].upper()].value,
            rec['concept_id'],
        ))

        merged = {
            'concept_id': retrieved[0]['concept_id'],
            'aliases': set(),
            'associated_with': set()
        }
        lower_ranked = retrieved[1:]
        if lower_ranked:
            merged['xrefs'] = [rec['concept_id'] for rec in lower_ranked]

        collection_fields = ('aliases', 'associated_with')
        single_value_fields = ('label', 'pediatric_disease')
        for rec in retrieved:
            for name in collection_fields:
                if name in rec:
                    merged[name] |= set(rec[name])
            for name in single_value_fields:
                # first (highest-priority) record providing a value wins
                if name in merged or name not in rec:
                    continue
                merged[name] = rec[name]

        # store non-empty collections as lists and drop the empty ones
        for name in collection_fields:
            if not merged[name]:
                del merged[name]
                continue
            merged[name] = list(merged[name])

        primary_id = merged["concept_id"].lower()
        merged['label_and_type'] = f'{primary_id}##merger'
        return merged, final_ids
Ejemplo n.º 4
0
 def _download_data(self):
     """Download Mondo thesaurus source file for loading into normalizer.

     Streams ``self._SRC_URL`` into ``mondo_<self._version>.owl`` under
     ``self._data_path``.

     :raise requests.exceptions.RequestException: if the request fails or
         the server answers with an HTTP error status
     """
     logger.info('Downloading Mondo data...')
     try:
         response = requests.get(self._SRC_URL, stream=True)
         # fail loudly instead of saving an HTTP error page as the ontology
         response.raise_for_status()
     except requests.exceptions.RequestException as e:
         logger.error(f'Mondo download failed: {e}')
         raise
     outfile = self._data_path / f'mondo_{self._version}.owl'
     # context manager guarantees the file is flushed and closed
     with open(outfile, "wb") as handle:
         for chunk in response.iter_content(chunk_size=512):
             if chunk:
                 handle.write(chunk)
     logger.info('Finished downloading Mondo Disease Ontology')
Ejemplo n.º 5
0
    def _transform_data(self):
        """Collect disease classes from the Mondo ontology and load them.

        Every subclass of the disease root that can be resolved and has a
        label is turned into a parameter dict and handed to
        ``self._load_disease``. Cross-references whose prefix is not in
        ``MONDO_PREFIX_LOOKUP`` are skipped; the rest go to 'xrefs' when the
        normalized prefix is in ``PREFIX_LOOKUP`` and to 'associated_with'
        otherwise.
        """
        mondo = owl.get_ontology(self._data_file.absolute().as_uri()).load()

        # URIs under the disease root, and under the root the original code
        # names the pediatric neoplasm root
        disease_uris = self._get_subclasses(
            "http://purl.obolibrary.org/obo/MONDO_0000001"
        )
        peds_uris = self._get_subclasses(
            "http://purl.obolibrary.org/obo/MONDO_0006517"
        )

        for uri in disease_uris:
            try:
                disease = mondo.search(iri=uri)[0]
            except TypeError:
                logger.error(f"Mondo.transform_data could not retrieve class "
                             f"for URI {uri}")
                continue
            try:
                label = disease.label[0]
            except IndexError:
                logger.debug(f"No label for Mondo concept {uri}")
                continue

            # exact synonyms, de-duplicated, without the label itself
            synonyms = {syn for syn in disease.hasExactSynonym if syn != label}
            xrefs = []
            associated_with = []
            params = {
                'concept_id': disease.id[0].lower(),
                'label': label,
                'aliases': list(synonyms),
                'xrefs': xrefs,
                'associated_with': associated_with,
            }

            for ref in disease.hasDbXref:
                prefix, id_no = ref.split(':', 1)
                normed_prefix = MONDO_PREFIX_LOOKUP.get(prefix)
                if not normed_prefix:
                    continue
                if normed_prefix.lower() in PREFIX_LOOKUP:
                    xrefs.append(f'{normed_prefix}:{id_no}')
                    continue
                if normed_prefix == NamespacePrefix.KEGG:
                    # KEGG identifiers are stored with a leading 'H'
                    id_no = f'H{id_no}'
                associated_with.append(f'{normed_prefix}:{id_no}')

            if disease.iri in peds_uris:
                params['pediatric_disease'] = True

            self._load_disease(params)
    def add_record(self, record: Dict, record_type: str = "identity"):
        """Submit a new record to the database via ``self.batch``.

        The record is modified in place: 'src_name' (looked up from the
        concept ID prefix), 'label_and_type' and 'item_type' are filled in
        before it is submitted. A boto3 ``ClientError`` is logged rather
        than propagated.

        :param Dict record: record to upload
        :param str record_type: type of record (either 'identity' or 'merger')
        """
        concept_id = record['concept_id']
        prefix = concept_id.split(':', 1)[0].lower()
        record['src_name'] = PREFIX_LOOKUP[prefix]
        record['label_and_type'] = f'{concept_id.lower()}##{record_type}'
        record['item_type'] = record_type
        try:
            self.batch.put_item(Item=record)
        except ClientError as e:
            reason = e.response['Error']['Message']
            logger.error("boto3 client error on add_record for "
                         f"{record['concept_id']}: "
                         f"{reason}")
    def get_records_by_type(self, query: str, match_type: str) -> List[Dict]:
        """Retrieve records for given query and match type.

        :param str query: string to match against; it is used verbatim in
            the lookup key
        :param str match_type: type of match to look for. Should be one
            of {"label", "alias", "xref", "associated_with"} (use
            `get_record_by_id` for concept ID lookup)
        :return: list of matching records. Empty if nothing matches or the
            lookup fails.
        """
        pk = f'{query}##{match_type.lower()}'
        filter_exp = Key('label_and_type').eq(pk)
        try:
            matches = self.diseases.query(KeyConditionExpression=filter_exp)
        except ClientError as e:
            logger.error(f"boto3 client error on get_records_by_type for "
                         f"search term {query}: "
                         f"{e.response['Error']['Message']}")
            return []
        # always hand back a list, even if the response carries no 'Items'
        return matches.get('Items', [])
Ejemplo n.º 8
0
    def perform_etl(self) -> List[str]:
        """Run the full ETL procedure for this source.

        Extracts the data, then loads metadata, transforms the data and
        flushes the database batch.

        :return: empty list (because OMIM IDs shouldn't be used to construct
            merged concept groups)
        :raise DownloadException: if data extraction fails; the input file
            then has to be placed in the data directory manually
        """
        try:
            self._extract_data()
        except DownloadException:
            logger.error("OMIM data extraction failed: input file must be "
                         "manually placed in data directory.")
            data_dir = self._data_path.absolute().as_uri()
            raise DownloadException(
                "Could not access OMIM data - see README for details. "
                f"Input data must be manually placed in {data_dir}"
            )
        else:
            self._load_meta()
            self._transform_data()
            self.database.flush_batch()
        return []
    def add_ref_record(self, term: str, concept_id: str, ref_type: str):
        """Submit an auxiliary/reference record to the database.

        The record maps the lower-cased term, tagged with the reference type,
        to the lower-cased concept ID. A boto3 ``ClientError`` is logged
        rather than propagated.

        :param str term: referent term
        :param str concept_id: concept ID to refer to
        :param str ref_type: one of {'alias', 'label', 'xref',
            'associated_with'}
        """
        ref_key = f'{term.lower()}##{ref_type}'
        prefix = concept_id.split(':', 1)[0].lower()
        source = PREFIX_LOOKUP[prefix]
        item = dict(
            label_and_type=ref_key,
            concept_id=concept_id.lower(),
            src_name=source,
            item_type=ref_type,
        )
        try:
            self.batch.put_item(Item=item)
        except ClientError as e:
            reason = e.response['Error']['Message']
            logger.error(f"boto3 client error adding reference {term} for "
                         f"{concept_id} with match type {ref_type}: "
                         f"{reason}")
    def get_record_by_id(self,
                         concept_id: str,
                         case_sensitive: bool = True,
                         merge: bool = False) -> Optional[Dict]:
        """Look up the record stored for a concept ID.

        :param str concept_id: concept ID for disease record
        :param bool case_sensitive: if true, fetch the item by its exact key
            (the concept ID must be cased as stored). Otherwise query on the
            lower-cased key only and return the first hit.
        :param bool merge: if true, look for the merged record; otherwise
            look for the identity record.
        :return: complete disease record, if match is found; None otherwise
            (including when boto3 reports a client error)
        """
        try:
            suffix = 'merger' if merge else 'identity'
            pk = f'{concept_id.lower()}##{suffix}'
            if not case_sensitive:
                condition = Key('label_and_type').eq(pk)
                found = self.diseases.query(KeyConditionExpression=condition)
                return found['Items'][0]
            found = self.diseases.get_item(Key={
                'label_and_type': pk,
                'concept_id': concept_id
            })
            return found['Item']
        except ClientError as e:
            logger.error(f"boto3 client error on get_records_by_id for "
                         f"search term {concept_id}: "
                         f"{e.response['Error']['Message']}")
            return None
        except (KeyError, IndexError):  # record doesn't exist
            return None
Ejemplo n.º 11
0
 def _download_data(self):
     """Download NCI thesaurus source file for loading into normalizer.

     Streams the zipped thesaurus into ``self._data_path``, extracts it,
     deletes the archive and renames ``Thesaurus.owl`` to
     ``ncit_<version>.owl``. The version is taken from the
     second-to-last path segment of ``self._SRC_DIR`` (text before the
     first underscore) and stored on ``self._version``.

     :raise requests.exceptions.RequestException: if the request fails or
         the server answers with an HTTP error status
     """
     logger.info('Downloading NCI Thesaurus...')
     url = self._SRC_DIR + self._SRC_FNAME
     zip_path = self._data_path / 'ncit.zip'
     try:
         response = requests.get(url, stream=True)
         # fail loudly instead of saving an HTTP error page as the archive
         response.raise_for_status()
     except requests.exceptions.RequestException as e:
         logger.error(f'NCIt download failed: {e}')
         raise
     # context manager closes the archive even if the stream breaks, so the
     # handle is never leaked and the data is flushed before extraction
     with open(zip_path, "wb") as handle:
         for chunk in response.iter_content(chunk_size=512):
             if chunk:
                 handle.write(chunk)
     with zipfile.ZipFile(zip_path, 'r') as zip_ref:
         zip_ref.extractall(self._data_path)
     remove(zip_path)
     version = self._SRC_DIR.split('/')[-2].split('_')[0]
     rename(self._data_path / 'Thesaurus.owl',
            self._data_path / f'ncit_{version}.owl')
     self._version = version
     logger.info('Finished downloading NCI Thesaurus')
Ejemplo n.º 12
0
    def create_merged_concepts(self, record_ids: List[str]):
        """Create concept groups, generate merged concept records, and
        update database.

        For every retrievable concept ID a group is formed from the ID and
        its record's 'xrefs' and appended to ``self._groups``. A merged
        record is then generated and stored for each entry in
        ``self._groups``, and every record that contributed to it gets its
        'merge_ref' field set to the merged record's lower-cased concept ID.

        :param List[str] record_ids: concept identifiers from which groups
            should be generated.
        """
        # build groups
        logger.info(f'Generating record ID sets from {len(record_ids)} records')  # noqa E501
        start = timer()
        for concept_id in record_ids:
            try:
                record = self._database.get_record_by_id(concept_id)
            except AttributeError:
                logger.error(f"`create_merged_concepts` received invalid "
                             f"concept ID: {concept_id}")
                continue
            if not record:
                logger.error(f"`create_merged_concepts` couldn't find "
                             f"{concept_id}")
                continue
            xrefs = record.get('xrefs')
            if xrefs:
                group = set(xrefs + [concept_id])
            else:
                group = {concept_id}
            self._groups.append((concept_id, group))
        end = timer()
        logger.debug(f'Built record ID sets in {end - start} seconds')

        # build merged concepts
        logger.info('Creating merged records and updating database...')
        start = timer()
        for record_id, group in self._groups:
            try:
                merged_record, merged_ids = self._generate_merged_record(group)
            except AttributeError:
                logger.error("`create_merged_concepts` received invalid "
                             f"group: {group} for concept {record_id}")
                continue
            self._database.add_record(merged_record, 'merger')
            merge_ref = merged_record['concept_id'].lower()

            # point each contributing record at its merged record
            for concept_id in merged_ids:
                self._database.update_record(concept_id, 'merge_ref',
                                             merge_ref)
        end = timer()
        logger.info("merged concept generation successful.")
        logger.debug(f'Generated and added concepts in {end - start} seconds')