Example #1
0
 class DB:
     """Hold a ``Database`` handle together with a ``Merge`` built on it."""

     def __init__(self):
         """Create the database handle and the merge helper.

         When the ``is_test_env`` name (defined outside this block) is truthy,
         every database table is deleted and then created again.
         """
         database = Database()
         self.db = database
         self.merge = Merge(database=database)
         if not is_test_env:
             return
         # Test runs start from freshly created tables.
         database.delete_all_db_tables()
         database.create_db_tables()
    def __init__(self, db_url: str = '', db_region: str = 'us-east-2'):
        """Create the handler together with its ``Database`` instance.

        :param str db_url: URL of the database source; forwarded to
            ``Database`` as ``db_url``.
        :param str db_region: AWS region name; forwarded to ``Database`` as
            ``region_name``.
        """
        self.db = Database(region_name=db_region, db_url=db_url)
Example #3
0
    def update_normalizer_db(normalizer, prod, db_url, update_all,
                             update_merged):
        """Update selected normalizer source(s) in the gene database.

        :param normalizer: whitespace-separated source names to update
        :param prod: if truthy, set ``GENE_NORM_PROD`` and use ``Database()``
            with its default arguments
        :param db_url: database endpoint; takes precedence over the
            ``GENE_NORM_DB_URL`` environment variable and the localhost default
        :param update_all: if truthy, update every entry of ``SOURCES``
        :param update_merged: forwarded to ``CLI()._update_normalizers``
        :raises Exception: if no source name remains after splitting, or if a
            requested name is not in ``SOURCES``
        """
        if prod:
            environ['GENE_NORM_PROD'] = "TRUE"
            db = Database()
        else:
            # Precedence: explicit argument, then environment, then local.
            endpoint_url = db_url or environ.get('GENE_NORM_DB_URL',
                                                 'http://localhost:8000')
            db = Database(db_url=endpoint_url)

        if update_all:
            CLI()._update_normalizers(list(SOURCES), db, update_merged)
            return
        if not normalizer:
            CLI()._help_msg()
            return

        requested = normalizer.lower().split()
        if not requested:
            raise Exception("Must enter a normalizer")

        unknown = set(requested) - set(SOURCES)
        if unknown:
            raise Exception(f"Not valid source(s): {unknown}")

        CLI()._update_normalizers(requested, db, update_merged)
Example #4
0
def add_other_id_refs():
    """Add other_id reference for other_identifiers attribute.

    Scans ``db.genes`` page by page. For every item whose ``label_and_type``
    ends with ``##identity``, one ``<other_id>##other_id`` reference item is
    written per entry in its ``other_identifiers`` attribute.
    """
    db = Database()
    scan_kwargs = {}

    # The batch writer buffers puts; using it as a context manager guarantees
    # that whatever is still buffered gets flushed when the scan is finished.
    with db.genes.batch_writer() as batch:
        while True:
            response = db.genes.scan(**scan_kwargs)

            for record in response['Items']:
                if not record['label_and_type'].endswith('##identity'):
                    continue
                for other_id in record.get('other_identifiers', []):
                    batch.put_item(
                        Item={
                            'label_and_type': f"{other_id.lower()}##other_id",
                            'concept_id': record['concept_id'].lower(),
                            'src_name': record['src_name']
                        })

            # A missing LastEvaluatedKey means this was the final page.
            last_evaluated_key = response.get('LastEvaluatedKey')
            if not last_evaluated_key:
                break
            scan_kwargs['ExclusiveStartKey'] = last_evaluated_key
Example #5
0
    def update_normalizer_db(normalizer, prod, db_url, update_all,
                             update_merged):
        """Update selected normalizer source(s) in the gene database.

        :param normalizer: whitespace-separated source names to update
        :param prod: if truthy, set ``GENE_NORM_PROD`` and use ``Database()``
            with its default arguments
        :param db_url: database endpoint; takes precedence over the
            ``GENE_NORM_DB_URL`` environment variable and the localhost default
        :param update_all: if truthy, update every entry of ``SOURCES``
        :param update_merged: forwarded to ``CLI()._update_normalizers``; when
            no source is named it triggers ``CLI()._load_merge`` instead of the
            help message
        :raises Exception: if no source name remains after splitting, or if a
            requested name is not in ``SOURCES``
        """
        # Sometimes GENE_NORM_EB_PROD is accidentally set. We should verify that
        # it should actually be used in CLI
        if "GENE_NORM_EB_PROD" in environ:
            confirm_aws_db_use("PROD")

        if prod:
            environ['GENE_NORM_PROD'] = "TRUE"
            db = Database()
        else:
            # Precedence: explicit argument, then environment, then local.
            endpoint_url = db_url or environ.get('GENE_NORM_DB_URL',
                                                 'http://localhost:8000')
            db = Database(db_url=endpoint_url)

        if update_all:
            CLI()._update_normalizers(list(SOURCES), db, update_merged)
            return
        if not normalizer:
            if update_merged:
                CLI()._load_merge(db, [])
            else:
                CLI()._help_msg()
            return

        requested = normalizer.lower().split()
        if not requested:
            raise Exception("Must enter a normalizer")

        unknown = set(requested) - set(SOURCES)
        if unknown:
            raise Exception(f"Not valid source(s): {unknown}")

        CLI()._update_normalizers(requested, db, update_merged)
class QueryHandler:
    """Class for normalizer management. Stores reference to database instance
    and normalizes query input.
    """
    def __init__(self, db_url: str = '', db_region: str = 'us-east-2'):
        """Create the handler together with its ``Database`` instance.

        :param str db_url: URL of the database source; forwarded to
            ``Database`` as ``db_url``.
        :param str db_region: AWS region name; forwarded to ``Database`` as
            ``region_name``.
        """
        self.db = Database(region_name=db_region, db_url=db_url)

    @staticmethod
    def emit_warnings(query_str: str) -> List:
        """Emit warnings if query contains non breaking space characters.

        :param str query_str: query string
        :return: List of warnings
        """
        warnings = []
        nbsp = re.search('\xa0| ', query_str)
        if nbsp:
            warnings = [{
                "non_breaking_space_characters":
                "Query contains non-breaking space characters"
            }]
            logger.warning(
                f'Query ({query_str}) contains non-breaking space characters.')
        return warnings

    def fetch_meta(self, src_name: str) -> SourceMeta:
        """Fetch metadata for src_name.

        :param str src_name: name of source to get metadata for
        :return: SourceMeta object containing source metadata
        """
        if src_name in self.db.cached_sources.keys():
            return self.db.cached_sources[src_name]
        else:
            try:
                db_response = self.db.metadata.get_item(
                    Key={'src_name': src_name})
                response = SourceMeta(**db_response['Item'])
                self.db.cached_sources[src_name] = response
                return response
            except ClientError as e:
                logger.error(e.response['Error']['Message'])

    @staticmethod
    def _cast_location_ints(record: Dict) -> Dict:
        """Ensure Locations are formatted correctly -- interval start and end need to
        be recast to ints from how they're structured in DynamoDB

        :param Dict record: original record
        :return: record with corrected locations attributes, if applicable
        """
        if 'locations' in record:
            for loc in record['locations']:
                if loc['type'] == 'SequenceLocation':
                    loc['start']['value'] = int(loc['start']['value'])
                    loc['end']['value'] = int(loc['end']['value'])
        return record

    def add_record(self, response: Dict[str, Dict], item: Dict,
                   match_type: MatchType) -> (Dict, str):
        """Convert a DynamoDB item to a ``Gene`` and file it under its source.

        The item is modified in place: ``label_and_type`` is removed,
        location values are recast to ints and ``match_type`` is set. The
        resulting ``Gene`` is only stored when the item's source is a key of
        ``response['source_matches']``; other sources are skipped.

        :param Dict[str, Dict] response: in-progress response object to return
            to client
        :param Dict item: Item retrieved from DynamoDB
        :param MatchType match_type: match type for query
        :return: Tuple containing updated response object, and string
            containing name of the source of the match
        """
        del item['label_and_type']
        # DynamoDB Numbers get converted to Decimal
        item = self._cast_location_ints(item)
        item["match_type"] = match_type
        gene = Gene(**item)
        src_name = item['src_name']

        matches = response['source_matches']
        if src_name in matches:
            existing = matches[src_name]
            if existing is None:
                # First hit for this source: create its slot with metadata.
                matches[src_name] = {
                    'records': [gene],
                    'source_meta_': self.fetch_meta(src_name)
                }
            else:
                existing['records'].append(gene)

        return response, src_name

    def fetch_record(self, response: Dict[str, Dict], concept_id: str,
                     match_type: MatchType) -> None:
        """Fetch the identity record for a concept ID and add it to response.

        Queries ``self.db.genes`` for ``<concept_id>##identity`` and passes
        the first returned item to ``add_record``. A ``ClientError`` raised
        along the way is logged and swallowed.

        :param Dict[str, Dict] response: in-progress response object to return
            to client.
        :param str concept_id: Concept id to fetch record for.
            Should be all lower-case.
        :param MatchType match_type: match type for record
        """
        identity_key = f'{concept_id}##identity'
        try:
            result = self.db.genes.query(
                KeyConditionExpression=Key('label_and_type').eq(identity_key))
            self.add_record(response, result['Items'][0], match_type)
        except ClientError as e:
            logger.error(e.response['Error']['Message'])

    def post_process_resp(self, resp: Dict) -> Dict:
        """Fill all empty source_matches slots with NO_MATCH results and
        sort source records by descending `match_type`.

        :param Dict resp: incoming response object
        :return: response object with empty source slots filled with
                NO_MATCH results and corresponding source metadata
        """
        for src_name in resp['source_matches'].keys():
            if resp['source_matches'][src_name] is None:
                resp['source_matches'][src_name] = {
                    'match_type': MatchType.NO_MATCH,
                    'records': [],
                    'source_meta_': self.fetch_meta(src_name)
                }
            else:
                records = resp['source_matches'][src_name]['records']
                if len(records) > 1:
                    records = sorted(records,
                                     key=lambda k: k.match_type,
                                     reverse=True)
        return resp

    def response_keyed(self, query: str, sources: Set[str]) -> Dict:
        """Return response as dict where key is source name and value
        is a list of records. Corresponds to `keyed=true` API parameter.

        :param str query: string to match against
        :param Set[str] sources: sources to match from
        :return: completed response object to return to client
        """
        resp = {
            'query': query,
            'warnings': self.emit_warnings(query),
            'source_matches': {source: None
                               for source in sources}
        }
        if query == '':
            return self.post_process_resp(resp)
        query_l = query.lower()

        queries = list()
        if [p for p in PREFIX_LOOKUP.keys() if query_l.startswith(p)]:
            pk = f'{query_l}##identity'
            queries.append(pk)

        for prefix in [
                p for p in NAMESPACE_LOOKUP.keys() if query_l.startswith(p)
        ]:
            pk = f'{NAMESPACE_LOOKUP[prefix].lower()}:{query_l}##identity'
            queries.append(pk)

        for match in ITEM_TYPES.values():
            pk = f'{query_l}##{match}'
            queries.append(pk)

        matched_concept_ids = list()
        for q in queries:
            try:
                query_resp = self.db.genes.query(
                    KeyConditionExpression=Key('label_and_type').eq(q))
                for record in query_resp['Items']:
                    concept_id = record['concept_id']
                    if concept_id in matched_concept_ids:
                        continue
                    else:
                        if record['item_type'] == "identity":
                            self.add_record(resp, record, MatchType.CONCEPT_ID)
                        else:
                            self.fetch_record(
                                resp, concept_id,
                                MatchType[record['item_type'].upper()])
                        matched_concept_ids.append(concept_id)

            except ClientError as e:
                logger.error(e.response['Error']['Message'])
                continue

        # remaining sources get no match
        return self.post_process_resp(resp)

    def response_list(self, query: str, sources: Set[str]) -> Dict:
        """Return response as list, where the first key-value in each item
        is the source name. Corresponds to `keyed=false` API parameter.

        :param str query: string to match against
        :param List[str] sources: sources to match from
        :return: completed response object to return to client
        """
        response_dict = self.response_keyed(query, sources)
        source_list = []
        for src_name in response_dict['source_matches'].keys():
            src = {
                "source": src_name,
            }
            to_merge = response_dict['source_matches'][src_name]
            src.update(to_merge)

            source_list.append(src)
        response_dict['source_matches'] = source_list

        return response_dict

    @staticmethod
    def _get_service_meta() -> ServiceMeta:
        """Build service metadata stamped with the current local time.

        :return: ServiceMeta carrying ``__version__`` and ``str(datetime.now())``
        """
        timestamp = str(datetime.now())
        return ServiceMeta(version=__version__, response_datetime=timestamp)

    def search(self,
               query_str: str,
               keyed: bool = False,
               incl: str = '',
               excl: str = '',
               **params) -> SearchService:
        """Return highest match for each source.

        Only sources that have an item in ``self.db.metadata`` are
        considered available; ``incl``/``excl`` are validated against that
        set.

        :param str query_str: query, a string, to search for
        :param bool keyed: if true, return response as dict keying source names
            to source objects; otherwise, return list of source objects
        :param str incl: comma-separated names of the only sources to use.
            Case-insensitive.
        :param str excl: comma-separated names of sources to leave out; all
            other available sources are used. Case-insensitive.
        :return: SearchService class containing all matches found in sources.
        :raises InvalidParameterException: if both incl and excl are given,
            or if either names a source that is not available.
        """
        possible_sources = {
            name.value.lower(): name.value
            for name in SourceName.__members__.values()
        }
        # Keep only the sources that actually have metadata stored.
        sources = {
            lowered: proper
            for lowered, proper in possible_sources.items()
            if self.db.metadata.get_item(Key={'src_name': proper}).get('Item')
        }

        if incl and excl:
            detail = "Cannot request both source inclusions and exclusions."
            raise InvalidParameterException(detail)

        if incl:
            query_sources = set()
            invalid_sources = []
            for requested in (n.strip() for n in incl.split(',')):
                lowered = requested.lower()
                if lowered in sources:
                    query_sources.add(sources[lowered])
                else:
                    invalid_sources.append(requested)
            if invalid_sources:
                detail = f"Invalid source name(s): {invalid_sources}"
                raise InvalidParameterException(detail)
        elif excl:
            req_exclusions = [n.strip() for n in excl.lower().split(',')]
            req_excl_dict = {r.lower(): r for r in req_exclusions}
            invalid_sources = [
                req for req_l, req in req_excl_dict.items()
                if req_l not in sources
            ]
            query_sources = {
                src for src_l, src in sources.items()
                if src_l not in req_excl_dict
            }
            if invalid_sources:
                detail = f"Invalid source name(s): {invalid_sources}"
                raise InvalidParameterException(detail)
        else:
            query_sources = set(sources.values())

        query_str = query_str.strip()

        build_response = self.response_keyed if keyed else self.response_list
        resp = build_response(query_str, query_sources)

        resp['service_meta_'] = self._get_service_meta()
        return SearchService(**resp)

    def _add_merged_meta(self, response: NormalizeService) -> NormalizeService:
        """Attach metadata for every source referenced by the gene descriptor.

        The descriptor's ``gene_id`` and ``xrefs`` are mapped to source names
        through the prefix before the first ``:``; metadata is fetched once
        per distinct source and stored on ``response.source_meta_``.

        :param Dict response: in-progress response object
        :return: completed response object.
        """
        descriptor = response.gene_descriptor
        concept_ids = [descriptor.gene_id] + (descriptor.xrefs or [])  # type: ignore  # noqa: E501
        sources_meta = {}
        for concept_id in concept_ids:
            namespace = concept_id.split(':')[0].lower()
            src_name = PREFIX_LOOKUP[namespace]
            if src_name in sources_meta:
                continue
            sources_meta[src_name] = self.fetch_meta(src_name)
        response.source_meta_ = sources_meta
        return response

    def _add_alt_matches(self, response: NormService, record: Dict,
                         possible_concepts: List[str]) -> NormService:
        """Add alternate matches warning to response object

        :param NormService response: in-progress response object
        :param Dict record: normalized record
        :param List[str] possible_concepts: other possible matches
        :return: updated response object
        """
        norm_concepts = set()
        for concept_id in possible_concepts:
            r = self.db.get_record_by_id(concept_id, True)
            if r:
                merge_ref = r.get("merge_ref")
                if merge_ref:
                    norm_concepts.add(merge_ref)
        norm_concepts = norm_concepts - {record["concept_id"]}
        if norm_concepts:
            response.warnings.append(
                {"multiple_normalized_concepts_found": list(norm_concepts)})
        return response

    def add_gene_descriptor(
            self,
            response: NormalizeService,
            record: Dict,
            match_type: MatchType,
            possible_concepts: Optional[List[str]] = None) -> NormalizeService:
        """Build a ``GeneDescriptor`` from record and attach it to response.

        Also sets ``response.match_type``, adds source metadata and, when
        ``possible_concepts`` is given, the alternate-matches warning. Note
        that ``record['locations']`` is overwritten in place with its first
        element when it is non-empty.

        :param Dict response: Response object
        :param Dict record: Gene record
        :param MatchType match_type: query's match type
        :param Optional[List[str]] possible_concepts: List of other normalized
            concepts found
        :return: Response with gene descriptor
        """
        params = {
            "id": f"normalize.gene:{quote(response.query)}",
            "label": record["symbol"],
            "gene_id": record["concept_id"]
        }

        # xrefs
        xrefs = record.get("xrefs")
        if xrefs:
            params["xrefs"] = xrefs

        # alternate labels: previous symbols and aliases, de-duplicated
        alt_labels = set()
        for key in ("previous_symbols", "aliases"):
            labels = record.get(key)
            if not labels:
                continue
            alt_labels.update([labels] if isinstance(labels, str) else labels)
        if alt_labels:
            params["alternate_labels"] = list(alt_labels)

        # extensions: (extension name, record key) pairs, in output order
        extension_and_record_labels = (
            ("symbol_status", "symbol_status"),
            ("approved_name", "label"),
            ("chromosome_location", "locations"),
            ("associated_with", "associated_with"),
            ("previous_symbols", "previous_symbols"),
        )
        extensions = []
        for ext_label, record_label in extension_and_record_labels:
            if not record.get(record_label):
                continue
            if ext_label == 'chromosome_location':
                # Only the first location is reported; the record entry
                # itself is replaced by it.
                record[record_label] = record[record_label][0]
            extensions.append(
                Extension(name=ext_label, value=record[record_label]))

        # handle gene types separately because they're wonky
        if record["item_type"] == "identity":
            gene_type = record.get("gene_type")
            if gene_type:
                type_field = GeneTypeFieldName[
                    record["src_name"].upper()].value
                extensions.append(Extension(name=type_field, value=gene_type))
        else:
            extensions.extend(
                Extension(name=field.value, value=value)
                for field in GeneTypeFieldName
                for value in record.get(field.value, []))
        if extensions:
            params["extensions"] = extensions

        # add warnings
        if possible_concepts:
            response = self._add_alt_matches(response, record,
                                             possible_concepts)

        response.gene_descriptor = GeneDescriptor(**params)
        response = self._add_merged_meta(response)
        response.match_type = match_type
        return response

    @staticmethod
    def _record_order(record: Dict) -> (int, str):
        """Sort key for records: source priority first, then concept ID.

        :param Dict record: individual record item in iterable to sort
        :return: tuple of the ``SourcePriority`` member for the record's
            upper-cased source name, and the record's concept ID
        """
        return (SourcePriority[record['src_name'].upper()],
                record['concept_id'])

    @staticmethod
    def _handle_failed_merge_ref(record, response, query) -> Dict:
        """Log a failed merge reference lookup and mark response as no-match.

        :param Dict record: record containing failed merge_ref
        :param Dict response: in-progress response object; its
            ``'match_type'`` item is set to ``MatchType.NO_MATCH``
        :param str query: original query value
        :return: response with no match
        """
        merge_ref = record['merge_ref']
        concept_id = record['concept_id']
        logger.error(f"Merge ref lookup failed for ref {merge_ref} "
                     f"in record {concept_id} from query {query}")
        response['match_type'] = MatchType.NO_MATCH
        return response

    def _prepare_normalized_response(self, query: str) -> Dict[str, Any]:
        """Build the starting keyword arguments for a normalize response.

        The result starts out as a no-match: it carries the query, a
        ``NO_MATCH`` match type, any query warnings and fresh service
        metadata.

        :param str query: user-provided query
        :return: basic normalization response boilerplate
        """
        warnings = self.emit_warnings(query)
        service_meta = self._get_service_meta()
        return {
            "query": query,
            "match_type": MatchType.NO_MATCH,
            "warnings": warnings,
            "service_meta_": service_meta,
        }

    def normalize(self, query: str) -> NormalizeService:
        """Return the merged, normalized concept for query.

        Starts from the no-match boilerplate and lets the normalized lookup
        fill in a gene descriptor via ``add_gene_descriptor``.

        :param str query: String to find normalized concept for
        :return: Normalized gene concept
        """
        boilerplate = self._prepare_normalized_response(query)
        response = NormalizeService(**boilerplate)
        return self._perform_normalized_lookup(
            response, query, self.add_gene_descriptor)

    def _resolve_merge(
            self,
            response: NormService,
            record: Dict,
            match_type: MatchType,
            callback: Callable,
            possible_concepts: Optional[List[str]] = None) -> NormService:
        """Given a record, return the corresponding normalized record

        :param NormalizationService response: in-progress response object
        :param Dict record: record to retrieve normalized concept for
        :param MatchType match_type: type of match that returned these records
        :param Callable callback: response constructor method
        :param Optional[List[str]] possible_concepts: alternate possible matches
        :return: Normalized response object
        """
        merge_ref = record.get("merge_ref")
        if merge_ref:
            # follow merge_ref
            merge = self.db.get_record_by_id(merge_ref, False, True)
            if merge is None:
                query = response.query
                logger.error(
                    f"Merge ref lookup failed for ref {record['merge_ref']} "
                    f"in record {record['concept_id']} from query `{query}`")
                return response
            else:
                return callback(response, merge, match_type, possible_concepts)
        else:
            # record is sole member of concept group
            return callback(response, record, match_type, possible_concepts)

    def _get_matches_by_type(self, query: str, match_type: str) -> List[Dict]:
        """Fetch the full records behind every reference of one match tier.

        :param str query: user-provided query
        :param str match_type: keyword of match type to check
        :return: List of records matching the query and match level, ordered
            by ``_record_order``
        """
        matching_records = []
        for ref in self.db.get_records_by_type(query, match_type):
            matching_records.append(
                self.db.get_record_by_id(ref["concept_id"], False))
        matching_records.sort(key=self._record_order)  # type: ignore
        return matching_records

    def _perform_normalized_lookup(self, response: NormService, query: str,
                                   response_builder: Callable) -> NormService:
        """Retrieve normalized concept, for use in normalization endpoints
        :param NormService response: in-progress response object
        :param str query: user-provided query
        :param Callable response_builder: response constructor callback method
        :return: completed service response object
        """
        if query == "":
            return response
        query_str = query.lower().strip()

        # check merged concept ID match
        record = self.db.get_record_by_id(query_str,
                                          case_sensitive=False,
                                          merge=True)
        if record:
            return response_builder(response, record, MatchType.CONCEPT_ID)

        # check concept ID match
        record = self.db.get_record_by_id(query_str, case_sensitive=False)
        if record:
            return self._resolve_merge(response, record, MatchType.CONCEPT_ID,
                                       response_builder)

        for match_type in ITEM_TYPES.values():
            # get matches list for match tier
            matching_refs = self.db.get_records_by_type(query_str, match_type)
            matching_records = \
                [self.db.get_record_by_id(m['concept_id'], False)
                 for m in matching_refs]
            matching_records.sort(key=self._record_order)  # type: ignore

            if len(matching_refs) > 1:
                possible_concepts = \
                    [ref["concept_id"] for ref in matching_refs]
            else:
                possible_concepts = None

            # attempt merge ref resolution until successful
            for match in matching_records:
                assert match is not None
                record = self.db.get_record_by_id(match["concept_id"], False)
                if record:
                    match_type_value = MatchType[match_type.upper()]
                    return self._resolve_merge(response, record,
                                               match_type_value,
                                               response_builder,
                                               possible_concepts)
        return response

    def _add_normalized_records(
        self,
        response: UnmergedNormalizationService,
        normalized_record: Dict,
        match_type: MatchType,
        possible_concepts: Optional[List[str]] = None
    ) -> UnmergedNormalizationService:
        """Fill an unmerged normalize response with per-source records.

        An ``identity`` record contributes itself only. Any other record is
        treated as a merged concept: its own concept ID and each of its
        ``xrefs`` are looked up, and every record found is grouped under its
        source in ``response.source_matches``.

        :param UnmergedNormalizationService response: in-progress response
        :param Dict normalized_record: record associated with normalized concept,
            either merged or single identity
        :param MatchType match_type: type of match achieved
        :param Optional[List[str]] possible_concepts: other possible results
        :return: Completed response object
        """
        response.match_type = match_type
        response.normalized_concept_id = normalized_record["concept_id"]

        if normalized_record["item_type"] != "identity":
            member_ids = [normalized_record["concept_id"]] + \
                normalized_record.get("xrefs", [])
            for member_id in member_ids:
                member = self.db.get_record_by_id(member_id,
                                                  case_sensitive=False)
                if member:
                    source = SourceName[member["src_name"].upper()]
                    gene = BaseGene(**self._cast_location_ints(member))
                    if source in response.source_matches:
                        response.source_matches[source].records.append(gene)
                    else:
                        response.source_matches[source] = MatchesNormalized(
                            records=[gene],
                            source_meta_=self.fetch_meta(source.value))
        else:
            source = SourceName[normalized_record["src_name"].upper()]
            response.source_matches[source] = MatchesNormalized(
                records=[
                    BaseGene(**self._cast_location_ints(normalized_record))
                ],
                source_meta_=self.fetch_meta(source.value))

        if possible_concepts:
            response = self._add_alt_matches(response, normalized_record,
                                             possible_concepts)
        return response

    def normalize_unmerged(self, query: str) -> UnmergedNormalizationService:
        """Return every source record grouped under the normalized concept
        found for the query.

        Starts from the no-match boilerplate with empty ``source_matches``
        and lets the normalized lookup fill it via
        ``_add_normalized_records``.

        :param str query: string to search against
        :return: Normalized response object
        """
        boilerplate = self._prepare_normalized_response(query)
        response = UnmergedNormalizationService(source_matches={},
                                                **boilerplate)
        return self._perform_normalized_lookup(
            response, query, self._add_normalized_records)
import sys
from pathlib import Path
from timeit import default_timer as timer
import click
from botocore.exceptions import ClientError
import logging

# Put the parent of this file's directory on sys.path before importing from
# `gene` (presumably so the package resolves when this file is run directly
# -- confirm against how the script is invoked).
APP_ROOT = Path(__file__).resolve().parents[1]
sys.path.append(f"{APP_ROOT}")

from gene.database import Database  # noqa: E402

# Shared 'gene' logger, set to emit everything from DEBUG upwards.
logger = logging.getLogger('gene')
logger.setLevel(logging.DEBUG)

# Module-level database handle, created at import time with default arguments.
db = Database()


def add_item_type(label_and_type: str, concept_id: str, item_type: str):
    """Set the ``item_type`` attribute on a single item of ``db.genes``.

    The item is addressed by its ``label_and_type`` / ``concept_id`` key
    pair. A ``ClientError`` from the update is logged and swallowed.

    :param str label_and_type: ``label_and_type`` key value of the item
    :param str concept_id: ``concept_id`` key value of the item
    :param str item_type: value to store in the ``item_type`` attribute
    """
    try:
        db.genes.update_item(
            Key={'label_and_type': label_and_type, 'concept_id': concept_id},
            UpdateExpression="set item_type=:r",
            ExpressionAttributeValues={':r': item_type})
    except ClientError as e:
        message = e.response['Error']['Message']
        logger.error(f"boto3 client error in `database.update_record()`: "
                     f"{message}")