Esempio n. 1
0
def get_statements_from_variants(
    graphkb_conn: GraphKBConnection, variants: List[Record]
) -> List[Statement]:
    """
    Fetch the GraphKB statements that have any of the given variants as a condition.

    Statements whose review status equals FAILED_REVIEW_STATUS are left out of
    the returned list.

    Args:
        graphkb_conn (GraphKBConnection): the graphkb api connection object
        variants (list.<dict>): list of variant records

    Returns:
        list.<dict>: list of Statement records from graphkb
    """
    # linked records for which the generic properties are requested
    linked_fields = ('conditions', 'subject', 'evidence', 'relevance', 'evidenceLevel')

    return_props = BASE_RETURN_PROPERTIES + ['sourceId', 'source.name', 'source.displayName']
    for field in linked_fields:
        return_props = return_props + [f'{field}.{p}' for p in GENERIC_RETURN_PROPERTIES]
    return_props = return_props + ['reviewStatus']

    query = {
        'target': 'Statement',
        'filters': {'conditions': convert_to_rid_list(variants), 'operator': 'CONTAINSANY'},
        'returnProperties': return_props,
    }

    kept = []
    for record in graphkb_conn.query(query):
        if record['reviewStatus'] == FAILED_REVIEW_STATUS:
            continue
        kept.append(record)
    return kept
Esempio n. 2
0
def aggregate_statements(
    graphkb_conn: GraphKBConnection,
    template: str,
    statements: List[Statement],
    disease_matches: Set[str],
) -> Dict[str, str]:
    """
    Group Statements that only differ in disease conditions and evidence

    One sentence is built per group and assigned to every statement in it.

    Note: the ``template`` argument is never read here; each group uses the
    ``displayNameTemplate`` of its first statement. Statements whose subject is
    a Therapy have their ``subject`` replaced in place by the preferred drug
    representation.

    Args:
        graphkb_conn: connection used to look up preferred drug representations
        template: unused (kept for interface compatibility)
        statements: the statement records to group
        disease_matches: passed through to the sentence template substitution

    Returns:
        mapping of statement record ID to the sentence built for its group
    """

    def group_key(statement: Statement) -> Tuple:
        non_disease_conditions = filter_by_record_class(
            statement['conditions'], 'Disease', exclude=True
        )
        parts = []
        for cond in non_disease_conditions:
            # the subject is added separately below, so skip it here
            if cond['@rid'] != statement['subject']['@rid']:
                parts.append(cond['displayName'])

        subject_class = statement.get('subject', {}).get('@class', 'Disease')
        if subject_class != 'Disease':
            subject = statement['subject']
            if subject['@class'] == 'Therapy':
                # swap in the preferred drug record (mutates the statement)
                statement['subject'] = get_preferred_drug_representation(
                    graphkb_conn, subject['@rid']
                )
            parts.append(statement['subject']['displayName'])

        parts.append(statement['relevance']['displayName'])
        parts.append(statement['displayNameTemplate'])
        return tuple(sorted(set(parts)))

    grouped: Dict[Tuple, List[Statement]] = {}
    for statement in statements:
        key = group_key(statement)
        if key not in grouped:
            grouped[key] = []
        grouped[key].append(statement)

    sentences = {}
    for members in grouped.values():
        first = members[0]
        relevance = first['relevance']
        group_template = first['displayNameTemplate']

        all_conditions = []
        all_subjects = []
        all_evidence = []
        for member in members:
            all_conditions.extend(member['conditions'])
            all_evidence.extend(member['evidence'])
            all_subjects.append(member['subject'])

        sentence = substitute_sentence_template(
            group_template,
            all_conditions,
            all_subjects,
            relevance,
            all_evidence,
            statement_rids=convert_to_rid_list(members),
            disease_matches=disease_matches,
        )

        # every member of the group shares the same sentence
        for member in members:
            sentences[member['@rid']] = sentence
    return sentences
Esempio n. 3
0
# report every variant record the input variant resolves to
for match in variant_matches:
    print(variant_name, 'will match', match['displayName'])

# return properties should be customized to the users needs
return_props = [
    *BASE_RETURN_PROPERTIES,
    'sourceId',
    'source.name',
    'source.displayName',
    *[f'conditions.{p}' for p in GENERIC_RETURN_PROPERTIES],
    *[f'subject.{p}' for p in GENERIC_RETURN_PROPERTIES],
    *[f'evidence.{p}' for p in GENERIC_RETURN_PROPERTIES],
    *[f'relevance.{p}' for p in GENERIC_RETURN_PROPERTIES],
    *[f'evidenceLevel.{p}' for p in GENERIC_RETURN_PROPERTIES],
]

# fetch all statements that have any of the matched variants as a condition
statement_query = {
    'target': 'Statement',
    'filters': {
        'conditions': convert_to_rid_list(variant_matches),
        'operator': 'CONTAINSANY',
    },
    'returnProperties': return_props,
}
statements = graphkb_conn.query(statement_query)

# preview the first few statements only
for statement in statements[:5]:
    relevance_name = statement['relevance']['displayName']
    subject_name = statement['subject']['displayName']
    source_name = statement['source']['displayName'] if statement['source'] else ''
    print(relevance_name, subject_name, source_name)

BASE_THERAPEUTIC_TERMS = 'therapeutic efficacy'

therapeutic_terms = get_term_tree(graphkb_conn,
Esempio n. 4
0
def annotate_variant(graphkb_conn: GraphKBConnection,
                     raw_variant_name: str,
                     include_unmatched: bool = False) -> List[Dict[str, str]]:
    """
    Match a variant against GraphKB and return one output row per statement.

    The variant name is converted with ``convert_aa_3to1`` before matching.
    Names containing ``c.*`` are skipped and reported as an error row. Errors
    raised while matching are captured as an error row rather than propagated.

    Reads the module-level ``therapeutic_terms`` collection to flag statements
    whose relevance is one of those terms.

    Args:
        graphkb_conn: the graphkb api connection object
        raw_variant_name: the variant name as given in the input
        include_unmatched: when True, also emit a row for variants that match
            no feature or no statements

    Returns:
        list of row dicts; empty when nothing matched and
        ``include_unmatched`` is False
    """

    def join_display_names(records) -> str:
        # sorted so the joined value does not depend on the record order
        return ';'.join(sorted([r['displayName'] for r in records]))

    results = []
    variant_name = convert_aa_3to1(raw_variant_name)

    if 'c.*' in variant_name:
        results.append({
            'variant': raw_variant_name,
            'error': f'skipping unsupported notation: {variant_name}',
        })
        return results

    print(f'processing: {variant_name}')

    try:
        variant_matches = match_positional_variant(graphkb_conn, variant_name)
    except FeatureNotFoundError:
        if include_unmatched:
            results.append({'variant': raw_variant_name})
        return results
    except Exception as err:
        # best-effort: record the failure against the variant and keep going
        results.append({'variant': raw_variant_name, 'error': str(err)})
        return results

    if variant_matches:
        print(f'{variant_name} matches {len(variant_matches)} variant records')
    # return properties should be customized to the users needs
    return_props = (BASE_RETURN_PROPERTIES +
                    ['sourceId', 'source.name', 'source.displayName'] +
                    [f'conditions.{p}' for p in GENERIC_RETURN_PROPERTIES] +
                    [f'subject.{p}' for p in GENERIC_RETURN_PROPERTIES] +
                    [f'evidence.{p}' for p in GENERIC_RETURN_PROPERTIES] +
                    [f'relevance.{p}' for p in GENERIC_RETURN_PROPERTIES] +
                    [f'evidenceLevel.{p}' for p in GENERIC_RETURN_PROPERTIES] +
                    ['reviewStatus'])

    # the query result is used as a list of statements below, so cast to List[Statement]
    statements = typing.cast(
        List[Statement],
        graphkb_conn.query({
            'target': 'Statement',
            'filters': {
                'conditions': convert_to_rid_list(variant_matches),
                'operator': 'CONTAINSANY',
            },
            'returnProperties': return_props,
        }),
    )
    if not statements:
        if include_unmatched:
            results.append({
                'variant_matches': join_display_names(variant_matches),
                'variant': raw_variant_name,
            })
        return results
    print(f'{variant_name} matches {len(statements)} statements')

    # identical for every row of this variant, so build it once
    matched_names = join_display_names(variant_matches)

    for statement in statements:
        row = {
            'variant_matches': matched_names,
            'variant': raw_variant_name,
            'statement.relevance': statement['relevance']['displayName'],
            'statement.@rid': statement['@rid'],
            'statement.subject': statement['subject']['displayName'],
            'statement.source':
            statement['source']['displayName'] if statement['source'] else '',
            'statement.evidence': join_display_names(statement['evidence']),
            'statement.conditions': join_display_names(statement['conditions']),
            # evidenceLevel may be empty/None on a statement
            'statement.evidence_level':
            join_display_names(statement['evidenceLevel'] or []),
            'statement.review_status': statement['reviewStatus'],
            'is_therapeutic': statement['relevance']['@rid'] in therapeutic_terms,
        }
        results.append(row)
    return results
Esempio n. 5
0
def get_variant(row):
    """
    Build the ``GENE:notation`` variant name for an annotated input row.

    The protein change (``ANN[*].HGVS_P``) is preferred; the cds change
    (``ANN[*].HGVS_C``) is used when no protein change is given.

    Returns:
        the variant name, or None when neither notation is present
    """
    # columns in order of preference
    for column in ('ANN[*].HGVS_P', 'ANN[*].HGVS_C'):
        notation = row[column]
        if pd.isnull(notation):
            continue
        return row['ANN[*].GENE'] + ':' + notation
    return None


input_df['variant'] = input_df.apply(get_variant, axis=1)

BASE_THERAPEUTIC_TERMS = 'therapeutic efficacy'

# record IDs of the base therapeutic term and the rest of its term tree
# (superclasses excluded); used to flag therapeutic statements
therapeutic_terms = set(
    convert_to_rid_list(
        get_term_tree(graphkb_conn,
                      BASE_THERAPEUTIC_TERMS,
                      include_superclasses=False)))

results: List[Dict] = []

# get_variant returns None for rows with neither a protein nor a cds change;
# drop those before sorting, since None/NaN cannot be ordered against strings
for raw_variant_name in sorted(input_df['variant'].dropna().unique()):
    try:
        results.extend(
            annotate_variant(graphkb_conn, raw_variant_name,
                             args.include_unmatched))
    except Exception as err:
        # best-effort: report the failure and continue with the next variant
        print(err)

print(f'writing: {args.output}')
df = pd.DataFrame.from_records(results)
# re-add the filename to the output