def _download_data(self):
    """Download OncoTree source data for loading into normalizer.

    Queries the OncoTree API for the latest stable release version, then
    streams the corresponding tumor-type tree JSON into the data directory.
    Sets ``self._version`` on success.

    :raise requests.exceptions.RequestException: if a download request fails
    """
    logger.info('Downloading OncoTree...')
    # get version for latest stable release
    versions_url = f"{self._SRC_API_ROOT}versions"
    versions = json.loads(requests.get(versions_url).text)
    latest = [
        v['release_date'] for v in versions
        if v['api_identifier'] == 'oncotree_latest_stable'
    ][0]
    version = latest.replace('-', '_')
    # download data
    url = f'{self._SRC_API_ROOT}tumorTypes/tree?version=oncotree_{version}'
    try:
        response = requests.get(url, stream=True)
    except requests.exceptions.RequestException as e:
        logger.error(f'OncoTree download failed: {e}')
        raise  # bare raise preserves the original traceback
    filename = self._data_path / f'oncotree_{version}.json'
    # context manager guarantees the file handle is closed even if a
    # write fails (the original leaked the handle)
    with open(filename, 'wb') as handle:
        for chunk in response.iter_content(chunk_size=512):
            if chunk:
                handle.write(chunk)
    self._version = version
    logger.info('Finished downloading OncoTree')
def update_record(self, concept_id: str, field: str, new_value: Any,
                  item_type: str = 'identity'):
    """Update the field of an individual record to a new value.

    :param str concept_id: record to update
    :param str field: name of field to update
    :param str new_value: new value
    :param str item_type: record type, one of {'identity', 'merger'}
    """
    # primary key is the lowercased concept ID + record type, paired with
    # the original-case concept ID
    primary_key = {
        'label_and_type': f'{concept_id.lower()}##{item_type}',
        'concept_id': concept_id
    }
    try:
        self.diseases.update_item(
            Key=primary_key,
            UpdateExpression=f"set {field}=:r",
            ExpressionAttributeValues={':r': new_value}
        )
    except ClientError as e:
        logger.error(f"boto3 client error in `database.update_record()`: "
                     f"{e.response['Error']['Message']}")
def _generate_merged_record(self, record_id_set: Set[str]) -> (Dict, List):
    """Generate merged record from provided concept ID group.

    Where attributes are sets, they should be merged, and where they are
    scalars, assign from the highest-priority source where that attribute
    is non-null. Priority is NCIt > Mondo > OncoTree > DO.

    :param Set record_id_set: group of concept IDs
    :return: completed merged disease object to be stored in DB, as well as
        a list of the IDs ultimately included in said record
    """
    records = []
    final_ids = []
    # fetch each member record; IDs that can't be retrieved are logged and
    # excluded from the group
    for record_id in record_id_set:
        record = self._database.get_record_by_id(record_id)
        if record:
            records.append(record)
            final_ids.append(record['concept_id'])
        else:
            logger.error(f"generate_merged_record could not retrieve "
                         f"record for {record_id} in {record_id_set}")

    def record_order(record):
        """Provide priority values of concepts for sort function."""
        src = record['src_name'].upper()
        source_rank = SourcePriority[src].value
        # concept_id breaks ties deterministically within a source
        return source_rank, record['concept_id']
    records.sort(key=record_order)

    # highest-priority record supplies the canonical concept ID
    # NOTE(review): if every lookup above failed, records is empty and this
    # raises IndexError — presumably callers guarantee a nonempty group
    merged_properties = {
        'concept_id': records[0]['concept_id'],
        'aliases': set(),
        'associated_with': set()
    }
    if len(records) > 1:
        # remaining members become xrefs of the canonical concept
        merged_properties['xrefs'] = [r['concept_id'] for r in records[1:]]

    set_fields = ['aliases', 'associated_with']
    scalar_fields = ['label', 'pediatric_disease']
    for record in records:
        for field in set_fields:
            if field in record:
                merged_properties[field] |= set(record[field])
        for field in scalar_fields:
            # records are priority-sorted, so the first value seen wins
            if field not in merged_properties and field in record:
                merged_properties[field] = record[field]

    # convert populated set fields to lists for storage; drop empty ones
    for field in set_fields:
        field_value = merged_properties[field]
        if field_value:
            merged_properties[field] = list(field_value)
        else:
            del merged_properties[field]

    merged_properties['label_and_type'] = \
        f'{merged_properties["concept_id"].lower()}##merger'
    return merged_properties, final_ids
def _download_data(self):
    """Download Mondo thesaurus source file for loading into normalizer.

    Streams the OWL file from ``self._SRC_URL`` into the data directory,
    named with the current version.

    :raise requests.exceptions.RequestException: if the download request
        fails
    """
    logger.info('Downloading Mondo data...')
    try:
        response = requests.get(self._SRC_URL, stream=True)
    except requests.exceptions.RequestException as e:
        logger.error(f'Mondo download failed: {e}')
        raise  # bare raise preserves the original traceback
    # context manager guarantees the file handle is closed even if a
    # write fails (the original leaked the handle)
    with open(self._data_path / f'mondo_{self._version}.owl', "wb") as handle:
        for chunk in response.iter_content(chunk_size=512):
            if chunk:
                handle.write(chunk)
    logger.info('Finished downloading Mondo Disease Ontology')
def _transform_data(self):
    """Gather and transform disease entities."""
    # load the ontology from the local OWL file via owlready2
    mondo = owl.get_ontology(self._data_file.absolute().as_uri()).load()
    # gather constants/search materials
    disease_root = "http://purl.obolibrary.org/obo/MONDO_0000001"
    disease_uris = self._get_subclasses(disease_root)
    # MONDO:0006517 — presumably the pediatric neoplasm subtree; members
    # get the pediatric_disease flag below
    peds_neoplasm_root = "http://purl.obolibrary.org/obo/MONDO_0006517"
    peds_uris = self._get_subclasses(peds_neoplasm_root)
    for uri in disease_uris:
        try:
            disease = mondo.search(iri=uri)[0]
        except TypeError:
            # search() returned something non-indexable for this IRI
            logger.error(f"Mondo.transform_data could not retrieve class "
                         f"for URI {uri}")
            continue
        try:
            label = disease.label[0]
        except IndexError:
            # unlabeled concepts are skipped entirely
            logger.debug(f"No label for Mondo concept {uri}")
            continue
        # dedupe synonyms and exclude the primary label
        aliases = list({d for d in disease.hasExactSynonym if d != label})
        params = {
            'concept_id': disease.id[0].lower(),
            'label': label,
            'aliases': aliases,
            'xrefs': [],
            'associated_with': [],
        }
        # classify each database cross-reference as an xref (known disease
        # source) or an associated_with entry (other namespace)
        for ref in disease.hasDbXref:
            prefix, id_no = ref.split(':', 1)
            normed_prefix = MONDO_PREFIX_LOOKUP.get(prefix, None)
            if not normed_prefix:
                # unrecognized namespace prefix — drop the reference
                continue
            xref = f'{normed_prefix}:{id_no}'
            if normed_prefix.lower() in PREFIX_LOOKUP:
                params['xrefs'].append(xref)
            elif normed_prefix == NamespacePrefix.KEGG:
                # KEGG disease IDs need an 'H' prepended to the local ID
                xref = f'{normed_prefix}:H{id_no}'
                params['associated_with'].append(xref)
            else:
                params['associated_with'].append(xref)
        if disease.iri in peds_uris:
            params['pediatric_disease'] = True
        self._load_disease(params)
def add_record(self, record: Dict, record_type: str = "identity"):
    """Add new record to database.

    Mutates `record` in place, stamping `src_name`, `label_and_type`,
    and `item_type` before handing it to the batch writer.

    :param Dict record: record to upload
    :param str record_type: type of record (either 'identity' or 'merger')
    """
    concept_id = record['concept_id']
    # source is derived from the concept ID's namespace prefix
    record['src_name'] = PREFIX_LOOKUP[concept_id.split(':')[0].lower()]
    record['label_and_type'] = f'{concept_id.lower()}##{record_type}'
    record['item_type'] = record_type
    try:
        self.batch.put_item(Item=record)
    except ClientError as e:
        logger.error("boto3 client error on add_record for "
                     f"{record['concept_id']}: "
                     f"{e.response['Error']['Message']}")
def get_records_by_type(self, query: str, match_type: str) -> List[Dict]:
    """Retrieve records for given query and match type.

    :param query: string to match against
    :param str match_type: type of match to look for. Should be one
        of {"label", "alias", "xref", "associated_with"} (use
        `get_record_by_id` for concept ID lookup)
    :return: list of matching records. Empty if lookup fails.
    """
    pk = f'{query}##{match_type.lower()}'
    filter_exp = Key('label_and_type').eq(pk)
    try:
        matches = self.diseases.query(KeyConditionExpression=filter_exp)
        # default to [] so the return matches the documented List contract
        # (previously returned None when 'Items' was absent)
        return matches.get('Items', [])
    except ClientError as e:
        logger.error(f"boto3 client error on get_records_by_type for "
                     f"search term {query}: "
                     f"{e.response['Error']['Message']}")
        return []
def perform_etl(self) -> List[str]:
    """Public-facing method to initiate ETL procedures on given data.

    Extracts source data, then loads metadata and transformed records,
    flushing any remaining batched writes at the end.

    :return: empty list (because OMIM IDs shouldn't be used to construct
        merged concept groups)
    :raise DownloadException: if source data cannot be accessed
    """
    try:
        self._extract_data()
    except DownloadException:
        logger.error("OMIM data extraction failed: input file must be "
                     "manually placed in data directory.")
        raise DownloadException(f"Could not access OMIM data - see README "
                                f"for details. Input data must be "
                                f"manually placed in "
                                f"{self._data_path.absolute().as_uri()}")
    # remaining pipeline stages run unconditionally once extraction succeeds
    for stage in (self._load_meta, self._transform_data,
                  self.database.flush_batch):
        stage()
    return []
def add_ref_record(self, term: str, concept_id: str, ref_type: str):
    """Add auxiliary/reference record to database.

    Reference records point a secondary term (alias, label, etc.) back at
    a concept ID.

    :param str term: referent term
    :param str concept_id: concept ID to refer to
    :param str ref_type: one of {'alias', 'label', 'xref',
        'associated_with'}
    """
    id_prefix = concept_id.split(':')[0].lower()
    reference = {
        'label_and_type': f'{term.lower()}##{ref_type}',
        'concept_id': concept_id.lower(),
        'src_name': PREFIX_LOOKUP[id_prefix],
        'item_type': ref_type,
    }
    try:
        self.batch.put_item(Item=reference)
    except ClientError as e:
        logger.error(f"boto3 client error adding reference {term} for "
                     f"{concept_id} with match type {ref_type}: "
                     f"{e.response['Error']['Message']}")
def get_record_by_id(self, concept_id: str,
                     case_sensitive: bool = True,
                     merge: bool = False) -> Optional[Dict]:
    """Fetch record corresponding to provided concept ID

    :param str concept_id: concept ID for disease record
    :param bool case_sensitive: if true, performs exact lookup, which is
        more efficient. Otherwise, performs filter operation, which
        doesn't require correct casing.
    :param bool merge: if true, look for merged record; look for identity
        record otherwise.
    :return: complete disease record, if match is found; None otherwise
    """
    try:
        if merge:
            pk = f'{concept_id.lower()}##merger'
        else:
            pk = f'{concept_id.lower()}##identity'
        if case_sensitive:
            match = self.diseases.get_item(Key={
                'label_and_type': pk,
                'concept_id': concept_id
            })
            return match['Item']
        else:
            # case-insensitive: query on the lowercased partition key only
            exp = Key('label_and_type').eq(pk)
            response = self.diseases.query(KeyConditionExpression=exp)
            return response['Items'][0]
    except ClientError as e:
        # fix: message previously named the wrong method
        # (`get_records_by_id`), which misdirected log readers
        logger.error(f"boto3 client error on get_record_by_id for "
                     f"search term {concept_id}: "
                     f"{e.response['Error']['Message']}")
        return None
    except (KeyError, IndexError):
        # record doesn't exist (missing 'Item' key or empty query result)
        return None
def _download_data(self):
    """Download NCI thesaurus source file for loading into normalizer.

    Streams the zipped Thesaurus OWL file, extracts it into the data
    directory, renames the OWL file to include the release version, and
    deletes the zip archive. Sets ``self._version`` on success.

    :raise requests.exceptions.RequestException: if the download request
        fails
    """
    logger.info('Downloading NCI Thesaurus...')
    url = self._SRC_DIR + self._SRC_FNAME
    zip_path = self._data_path / 'ncit.zip'
    try:
        response = requests.get(url, stream=True)
    except requests.exceptions.RequestException as e:
        logger.error(f'NCIt download failed: {e}')
        raise  # bare raise preserves the original traceback
    # context manager guarantees the handle closes even if a write fails
    # (the original's explicit close() was skipped on error)
    with open(zip_path, "wb") as handle:
        for chunk in response.iter_content(chunk_size=512):
            if chunk:
                handle.write(chunk)
    with zipfile.ZipFile(zip_path, 'r') as zip_ref:
        zip_ref.extractall(self._data_path)
    remove(zip_path)
    # version is presumably encoded in the penultimate path segment of the
    # source URL, before an underscore — TODO confirm against _SRC_DIR
    version = self._SRC_DIR.split('/')[-2].split('_')[0]
    rename(self._data_path / 'Thesaurus.owl',
           self._data_path / f'ncit_{version}.owl')
    self._version = version
    logger.info('Finished downloading NCI Thesaurus')
def create_merged_concepts(self, record_ids: List[str]):
    """Create concept groups, generate merged concept records, and
    update database.

    :param List[str] record_ids: concept identifiers from which groups
        should be generated.
    """
    # build groups: each concept's group is itself plus its xrefs
    logger.info(f'Generating record ID sets from {len(record_ids)} records')  # noqa E501
    start = timer()
    for concept_id in record_ids:
        try:
            record = self._database.get_record_by_id(concept_id)
        except AttributeError:
            logger.error(f"`create_merged_concepts` received invalid "
                         f"concept ID: {concept_id}")
            continue
        if not record:
            logger.error(f"generate_merged_concepts couldn't find "
                         f"{concept_id}")
            continue
        xrefs = record.get('xrefs', None)
        if xrefs:
            group = set(xrefs + [concept_id])
        else:
            group = {concept_id}
        self._groups.append((concept_id, group))
    end = timer()
    logger.debug(f'Built record ID sets in {end - start} seconds')

    # build merged concepts and point each member back at its merged record
    logger.info('Creating merged records and updating database...')
    start = timer()
    for record_id, group in self._groups:
        try:
            merged_record, merged_ids = self._generate_merged_record(group)
        except AttributeError:
            # fix: added the missing space after the colon so the group
            # isn't jammed against the message text in logs
            logger.error("`create_merged_concepts` received invalid group: "
                         f"{group} for concept {record_id}")
            continue
        self._database.add_record(merged_record, 'merger')
        merge_ref = merged_record['concept_id'].lower()
        for concept_id in merged_ids:
            self._database.update_record(concept_id, 'merge_ref', merge_ref)
    end = timer()
    logger.info("merged concept generation successful.")
    # fix: removed stray ')' that previously dangled at the end of this
    # debug message
    logger.debug(f'Generated and added concepts in {end - start} seconds')