def get_statements_from_variants(
    graphkb_conn: GraphKBConnection, variants: List[Record]
) -> List[Statement]:
    """
    Fetch the statements whose conditions include any of the given variants

    Statements whose reviewStatus equals FAILED_REVIEW_STATUS are left out of
    the result.

    Args:
        graphkb_conn (GraphKBConnection): the graphkb api connection object
        variants (list.<dict>): list of variant records

    Returns:
        list.<dict>: list of Statement records from graphkb
    """
    # linked records that are expanded with the generic set of properties
    linked_fields = ('conditions', 'subject', 'evidence', 'relevance', 'evidenceLevel')

    return_props = list(BASE_RETURN_PROPERTIES)
    return_props += ['sourceId', 'source.name', 'source.displayName']
    return_props += [
        f'{field}.{prop}' for field in linked_fields for prop in GENERIC_RETURN_PROPERTIES
    ]
    # reviewStatus is needed for the filtering done below
    return_props.append('reviewStatus')

    query = {
        'target': 'Statement',
        'filters': {'conditions': convert_to_rid_list(variants), 'operator': 'CONTAINSANY'},
        'returnProperties': return_props,
    }

    kept = []
    for record in graphkb_conn.query(query):
        if record['reviewStatus'] != FAILED_REVIEW_STATUS:
            kept.append(record)
    return kept
def aggregate_statements(
    graphkb_conn: GraphKBConnection,
    template: str,
    statements: List[Statement],
    disease_matches: Set[str],
) -> Dict[str, str]:
    """
    Build one shared sentence for each set of statements that differ only in
    their disease conditions and evidence

    Args:
        graphkb_conn: connection passed to get_preferred_drug_representation
            when the subject of a statement is a Therapy record
        template: not read by this function; every group uses the
            displayNameTemplate of its first statement instead
        statements: the statement records to group. The subject of a statement
            is replaced in place when it is a Therapy record
        disease_matches: passed through to substitute_sentence_template

    Returns:
        mapping of the @rid of every input statement to its group's sentence
    """

    def grouping_key(record: Statement) -> Tuple:
        # non-disease conditions, leaving out the subject itself
        parts = []
        for cond in filter_by_record_class(record['conditions'], 'Disease', exclude=True):
            if cond['@rid'] != record['subject']['@rid']:
                parts.append(cond['displayName'])

        # a missing subject (or subject class) is treated like a disease subject
        subject_class = record.get('subject', {}).get('@class', 'Disease')
        if subject_class != 'Disease':
            if subject_class == 'Therapy':
                # swap in the preferred drug record; this mutates the statement
                record['subject'] = get_preferred_drug_representation(
                    graphkb_conn, record['subject']['@rid']
                )
            parts.append(record['subject']['displayName'])

        parts.extend([record['relevance']['displayName'], record['displayNameTemplate']])
        return tuple(sorted(set(parts)))

    groups: Dict[Tuple, List[Statement]] = {}
    for record in statements:
        group_key = grouping_key(record)
        if group_key not in groups:
            groups[group_key] = []
        groups[group_key].append(record)

    sentences: Dict[str, str] = {}
    for members in groups.values():
        first = members[0]
        relevance = first['relevance']
        group_template = first['displayNameTemplate']

        # pool the pieces of every statement in the group
        conditions = [cond for member in members for cond in member['conditions']]
        evidence = [item for member in members for item in member['evidence']]
        subjects = [member['subject'] for member in members]

        sentence = substitute_sentence_template(
            group_template,
            conditions,
            subjects,
            relevance,
            evidence,
            statement_rids=convert_to_rid_list(members),
            disease_matches=disease_matches,
        )

        # every member of the group shares the same sentence
        sentences.update({member['@rid']: sentence for member in members})
    return sentences
for match in variant_matches: print(variant_name, 'will match', match['displayName']) # return properties should be customized to the users needs return_props = (BASE_RETURN_PROPERTIES + ['sourceId', 'source.name', 'source.displayName'] + [f'conditions.{p}' for p in GENERIC_RETURN_PROPERTIES] + [f'subject.{p}' for p in GENERIC_RETURN_PROPERTIES] + [f'evidence.{p}' for p in GENERIC_RETURN_PROPERTIES] + [f'relevance.{p}' for p in GENERIC_RETURN_PROPERTIES] + [f'evidenceLevel.{p}' for p in GENERIC_RETURN_PROPERTIES]) statements = graphkb_conn.query({ 'target': 'Statement', 'filters': { 'conditions': convert_to_rid_list(variant_matches), 'operator': 'CONTAINSANY' }, 'returnProperties': return_props, }) for statement in statements[:5]: print( statement['relevance']['displayName'], statement['subject']['displayName'], statement['source']['displayName'] if statement['source'] else '', ) BASE_THERAPEUTIC_TERMS = 'therapeutic efficacy' therapeutic_terms = get_term_tree(graphkb_conn,
def annotate_variant(
    graphkb_conn: GraphKBConnection, raw_variant_name: str, include_unmatched: bool = False
) -> List[Dict[str, str]]:
    """
    Match one variant against GraphKB and flatten its statements into rows

    Progress messages are printed along the way. The module-level
    `therapeutic_terms` collection is read to fill the `is_therapeutic` column.

    Args:
        graphkb_conn: the graphkb api connection object
        raw_variant_name: the variant name as given in the input; it is copied
            into the 'variant' field of every row
        include_unmatched: when True, a row is also produced for a variant
            whose feature is not found, or whose matches have no statements

    Returns:
        one row per statement found. A single row with an 'error' field is
        returned when the notation is skipped or matching raises; the list may
        also be empty
    """
    results = []
    variant_name = convert_aa_3to1(raw_variant_name)

    if 'c.*' in variant_name:
        results.append(
            {
                'variant': raw_variant_name,
                'error': f'skipping unsupported notation: {variant_name}',
            }
        )
        return results

    print(f'processing: {variant_name}')

    try:
        variant_matches = match_positional_variant(graphkb_conn, variant_name)
    except FeatureNotFoundError:
        if include_unmatched:
            results.append({'variant': raw_variant_name})
        return results
    except Exception as err:
        # any other matching failure is reported in the row instead of raised
        results.append({'variant': raw_variant_name, 'error': str(err)})
        return results

    if not variant_matches:
        return results

    print(f'{variant_name} matches {len(variant_matches)} variant records')

    # return properties should be customized to the user's needs
    return_props = (
        BASE_RETURN_PROPERTIES
        + ['sourceId', 'source.name', 'source.displayName']
        + [f'conditions.{p}' for p in GENERIC_RETURN_PROPERTIES]
        + [f'subject.{p}' for p in GENERIC_RETURN_PROPERTIES]
        + [f'evidence.{p}' for p in GENERIC_RETURN_PROPERTIES]
        + [f'relevance.{p}' for p in GENERIC_RETURN_PROPERTIES]
        + [f'evidenceLevel.{p}' for p in GENERIC_RETURN_PROPERTIES]
        + ['reviewStatus']
    )

    # the query result is measured and iterated below, so it is cast to a list
    statements = typing.cast(
        List[Statement],
        graphkb_conn.query(
            {
                'target': 'Statement',
                'filters': {
                    'conditions': convert_to_rid_list(variant_matches),
                    'operator': 'CONTAINSANY',
                },
                'returnProperties': return_props,
            }
        ),
    )

    if not statements and not include_unmatched:
        return results

    # identical for every row of this variant, so it is built only once
    matched_names = ';'.join(sorted(v['displayName'] for v in variant_matches))

    if not statements:
        results.append({'variant_matches': matched_names, 'variant': raw_variant_name})
        return results

    print(f'{variant_name} matches {len(statements)} statements')

    for statement in statements:
        row = {
            'variant_matches': matched_names,
            'variant': raw_variant_name,
            'statement.relevance': statement['relevance']['displayName'],
            'statement.@rid': statement['@rid'],
            'statement.subject': statement['subject']['displayName'],
            'statement.source': statement['source']['displayName'] if statement['source'] else '',
            'statement.evidence': ';'.join(
                sorted(e['displayName'] for e in statement['evidence'])
            ),
            'statement.conditions': ';'.join(
                sorted(e['displayName'] for e in statement['conditions'])
            ),
            # evidenceLevel may be empty/None, in which case the field is ''
            'statement.evidence_level': ';'.join(
                sorted(e['displayName'] for e in (statement['evidenceLevel'] or []))
            ),
            'statement.review_status': statement['reviewStatus'],
            'is_therapeutic': statement['relevance']['@rid'] in therapeutic_terms,
        }
        results.append(row)
    return results
def get_variant(row):
    """
    Build the `GENE:notation` variant name for one input row

    The protein change is used when present; otherwise the cds variant
    description is used.

    Args:
        row: a row holding the 'ANN[*].GENE', 'ANN[*].HGVS_P' and
            'ANN[*].HGVS_C' fields

    Returns:
        the variant name, or None when the row has neither description
    """
    # protein notation first, then fall back to the cds variant description
    for column in ('ANN[*].HGVS_P', 'ANN[*].HGVS_C'):
        notation = row[column]
        if not pd.isnull(notation):
            return row['ANN[*].GENE'] + ':' + notation
    return None


input_df['variant'] = input_df.apply(get_variant, axis=1)

BASE_THERAPEUTIC_TERMS = 'therapeutic efficacy'
therapeutic_terms = set(
    convert_to_rid_list(
        get_term_tree(graphkb_conn, BASE_THERAPEUTIC_TERMS, include_superclasses=False)
    )
)

results: List[Dict] = []

# get_variant yields None for rows without any description; those are dropped
# here because sorted() cannot order missing values against strings
for raw_variant_name in sorted(input_df['variant'].dropna().unique()):
    try:
        results.extend(annotate_variant(graphkb_conn, raw_variant_name, args.include_unmatched))
    except Exception as err:
        # best effort: report the failure and carry on with the next variant
        print(err)

print(f'writing: {args.output}')
df = pd.DataFrame.from_records(results)
# re-add the filename to the output