コード例 #1
0
def get_all_stats(model_name, test_corpora, date):
    """Collect the model stats and the per-corpus test stats for one date.

    Parameters
    ----------
    model_name : str
        Name of the model whose stats are requested.
    test_corpora : list[str]
        Names of the test corpora to load test stats for.
    date : str
        Date for which the stats should be loaded.

    Returns
    -------
    model_stats : dict
        Stats of the model itself.
    test_stats_by_corpus : dict
        Test stats keyed by test corpus name. A corpus for which nothing
        was found is still present, mapped to whatever falsy value the
        lookup returned.
    """
    test_stats_by_corpus = {}
    model_stats, _model_key = get_model_stats(model_name, 'model', date=date)
    for corpus_name in test_corpora:
        corpus_stats, _corpus_key = get_model_stats(
            model_name, 'test', tests=corpus_name, date=date)
        # Missing stats are recorded as-is; the caller decides what to do.
        test_stats_by_corpus[corpus_name] = corpus_stats
        if corpus_stats:
            continue
        logger.info(f'Could not find test stats for {corpus_name}')
    return model_stats, test_stats_by_corpus
コード例 #2
0
ファイル: api.py プロジェクト: kolusask/emmaa
def get_statement_evidence_page():
    """Show the evidence for the statements selected by hash.

    The following query parameters are read from ``request.args``:

    - ``stmt_hash`` (repeatable): hashes of the statements to display.
    - ``source``: ``'model_statement'`` to search the cached statements of
      ``model``, or ``'test'`` to search the statements of the cached tests
      of ``test_corpus``. Any other value aborts with a 404 response.
    - ``model``: name of the model.
    - ``test_corpus``: name of the test corpus; required when ``source`` is
      ``'test'``.
    - ``format``: ``'html'`` (default) renders the evidence template; any
      other value returns the first matching statement as JSON.

    Returns
    -------
    The rendered ``evidence_template.html`` page, or a JSON ``Response``
    with the first matching statement. A 404 response is produced when the
    source is invalid, the test corpus is missing, or JSON output was
    requested but no statement matched.
    """
    stmt_hashes = request.args.getlist('stmt_hash')
    source = request.args.get('source')
    model = request.args.get('model')
    test_corpus = request.args.get('test_corpus', '')
    display_format = request.args.get('format', 'html')
    # Hashes are compared as strings; convert the requested ones only once.
    wanted_hashes = [str(stmt_hash) for stmt_hash in stmt_hashes]
    stmts = []
    if source == 'model_statement':
        test_stats, _ = get_model_stats(model, 'test')
        # The stats lookup can come back empty; fall back to no path counts
        # instead of failing on the subscript.
        stmt_counts_dict = {}
        if test_stats:
            stmt_counts_dict = dict(
                test_stats['test_round_summary'].get('path_stmt_counts', []))
        for stmt in _load_stmts_from_cache(model):
            current_hash = str(stmt.get_hash())
            for wanted in wanted_hashes:
                if current_hash == wanted:
                    stmts.append(stmt)
    elif source == 'test':
        if not test_corpus:
            abort(Response('Need test corpus name to load evidence', 404))
        stmt_counts_dict = None
        for test in _load_tests_from_cache(test_corpus):
            current_hash = str(test.stmt.get_hash())
            for wanted in wanted_hashes:
                if current_hash == wanted:
                    stmts.append(test.stmt)
    else:
        abort(Response('Source should be model_statement or test', 404))

    if display_format != 'html':
        # Without a match there is nothing to serialize; report it instead
        # of raising IndexError on stmts[0].
        if not stmts:
            abort(Response('No statements found for the given hashes', 404))
        stmt_json = json.dumps(stmts[0].to_json(), indent=1)
        return Response(stmt_json, mimetype='application/json')

    stmts_by_hash = {str(stmt.get_hash()): stmt for stmt in stmts}
    curations = get_curations(pa_hash=stmt_hashes)
    # Group curation tags by (statement hash, evidence source hash).
    cur_dict = defaultdict(list)
    for cur in curations:
        cur_dict[(cur.pa_hash,
                  cur.source_hash)].append({'error_type': cur.tag})
    cur_counts = _count_curations(curations, stmts_by_hash)
    stmt_rows = [
        _get_stmt_row(stmt, source, model, cur_counts, test_corpus,
                      stmt_counts_dict, cur_dict, True)
        for stmt in stmts
    ]
    return render_template('evidence_template.html',
                           stmt_rows=stmt_rows,
                           model=model,
                           source=source,
                           test_corpus=test_corpus if test_corpus else None,
                           table_title='Statement Evidence and Curation',
                           msg=None,
                           is_all_stmts=False)
コード例 #3
0
ファイル: test_s3.py プロジェクト: indralab/emmaa
def test_get_model_statistics():
    """Check ``get_model_stats`` lookups against the prepared test bucket.

    Covers the latest model stats, the latest test stats, a date with no
    stats file, and the same date after a stats file has been uploaded.
    """
    # Import locally, as recommended when using moto
    from emmaa.model import get_model_stats
    s3_client = setup_bucket(
        add_model=True, add_model_stats=True, add_test_stats=True)

    # Latest model stats
    latest_model_stats, model_key = get_model_stats(
        'test', 'model', bucket=TEST_BUCKET_NAME)
    assert isinstance(latest_model_stats, dict)
    assert model_key.startswith('model_stats/test/model_stats_')

    # Latest test stats
    latest_test_stats, test_key = get_model_stats(
        'test', 'test', 'simple_tests', bucket=TEST_BUCKET_NAME)
    assert isinstance(latest_test_stats, dict)
    assert test_key.startswith('stats/test/test_stats_')

    # A date without any stats file yields nothing
    missing_stats, missing_key = get_model_stats(
        'test', 'model', date='2020-01-01', bucket=TEST_BUCKET_NAME)
    assert not missing_stats
    assert not missing_key

    # Upload a file for that date and look it up again
    dated_key = 'model_stats/test/model_stats_2020-01-01-00-00-00.json'
    s3_client.put_object(Body=json.dumps(previous_model_stats, indent=1),
                         Bucket=TEST_BUCKET_NAME,
                         Key=dated_key)
    dated_stats, found_key = get_model_stats(
        'test', 'model', date='2020-01-01', bucket=TEST_BUCKET_NAME)
    assert dated_stats
    assert isinstance(dated_stats, dict)
    assert found_key == dated_key
コード例 #4
0
ファイル: api.py プロジェクト: kolusask/emmaa
def get_all_statements_page(model):
    """Render one page (up to 1000 rows) of all statements of a model.

    The following query parameters are read from ``request.args``:

    - ``sort_by``: ``'evidence'`` (default) sorts by evidence count,
      ``'paths'`` follows the order of the ``path_stmt_counts`` entry of the
      test stats. If path counts are unavailable, or the value is not
      recognised, the page is sorted by evidence.
    - ``page``: 1-based page number; missing, non-integer or non-positive
      values are treated as page 1.
    - ``filter_curated``: ``'true'`` hides statements that have curations.

    Parameters
    ----------
    model : str
        Name of the model whose statements are listed.

    Returns
    -------
    The rendered ``evidence_template.html`` page.
    """
    page_size = 1000
    sort_by = request.args.get('sort_by', 'evidence')
    # A malformed page number should not turn into a server error, and a
    # page below 1 would produce a negative slice offset.
    try:
        page = int(request.args.get('page', 1))
    except (TypeError, ValueError):
        page = 1
    page = max(page, 1)
    filter_curated = request.args.get('filter_curated', False) == 'true'
    offset = (page - 1) * page_size

    stmts = _load_stmts_from_cache(model)
    stmts_by_hash = {str(stmt.get_hash()): stmt for stmt in stmts}
    msg = None
    curations = get_curations()
    cur_counts = _count_curations(curations, stmts_by_hash)
    if filter_curated:
        stmts = [
            stmt for stmt in stmts if str(stmt.get_hash()) not in cur_counts
        ]

    test_stats, _ = get_model_stats(model, 'test')
    # The stats lookup can come back empty; treat that as "no path counts".
    stmt_counts = []
    if test_stats:
        stmt_counts = test_stats['test_round_summary'].get(
            'path_stmt_counts', [])
    stmt_counts_dict = dict(stmt_counts)

    # Ceiling division: a partially filled last page still counts.
    total_pages = (len(stmts) + page_size - 1) // page_size
    next_page = page + 1 if page < total_pages else None
    prev_page = page - 1 if page > 1 else None

    if sort_by == 'paths' and stmt_counts:
        # Follow the order of the path counts, skipping hashes that are not
        # among the loaded statements.
        stmts = [
            stmts_by_hash[stmt_hash]
            for stmt_hash, _count in stmt_counts[offset:offset + page_size]
            if stmt_hash in stmts_by_hash
        ]
    else:
        if sort_by == 'paths':
            msg = 'Sorting by paths is not available, sorting by evidence'
        # Also used for unrecognised sort_by values so that the result is
        # always limited to a single page.
        stmts = sorted(stmts, key=lambda x: len(x.evidence),
                       reverse=True)[offset:offset + page_size]

    stmt_rows = [
        _get_stmt_row(stmt, 'model_statement', model, cur_counts, None,
                      stmt_counts_dict)
        for stmt in stmts
    ]
    table_title = f'All statements in {model.upper()} model.'

    # Prefer the "latest" statements file, else the newest dated one.
    if does_exist(EMMAA_BUCKET_NAME,
                  f'assembled/{model}/latest_statements_{model}'):
        fkey = f'assembled/{model}/latest_statements_{model}.json'
        link = f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}'
    elif does_exist(EMMAA_BUCKET_NAME, f'assembled/{model}/statements_'):
        fkey = find_latest_s3_file(EMMAA_BUCKET_NAME,
                                   f'assembled/{model}/statements_', '.json')
        link = f'https://{EMMAA_BUCKET_NAME}.s3.amazonaws.com/{fkey}'
    else:
        link = None
    return render_template('evidence_template.html',
                           stmt_rows=stmt_rows,
                           model=model,
                           source='model_statement',
                           table_title=table_title,
                           msg=msg,
                           is_all_stmts=True,
                           prev=prev_page,
                           next=next_page,
                           filter_curated=filter_curated,
                           sort_by=sort_by,
                           link=link)
コード例 #5
0
ファイル: api.py プロジェクト: kolusask/emmaa
def get_model_tests_page(model):
    """Render the result page of a single test for a model.

    The following query parameters are read from ``request.args``:

    - ``model_type``: must be one of ``ALL_MODEL_TYPES``.
    - ``test_hash``: hash identifying the test within the test results.
    - ``test_corpus``: name of the test corpus (required).
    - ``date``: date of the test stats to load.

    Parameters
    ----------
    model : str
        Name of the model the test was applied to.

    Returns
    -------
    The rendered ``tests_template.html`` page. A 404 response is produced
    when the test corpus is missing, the model type is unknown, no stats
    are found, or the stats contain no result for the requested test hash
    or model type.
    """
    model_type = request.args.get('model_type')
    test_hash = request.args.get('test_hash')
    test_corpus = request.args.get('test_corpus')
    if not test_corpus:
        abort(Response('Test corpus has to be provided', 404))
    date = request.args.get('date')
    if model_type not in ALL_MODEL_TYPES:
        abort(Response(f'Model type {model_type} does not exist', 404))
    test_stats, file_key = get_model_stats(model,
                                           'test',
                                           tests=test_corpus,
                                           date=date)
    if not test_stats:
        abort(Response(f'Data for {model} for {date} was not found', 404))
    try:
        current_test = \
            test_stats['test_round_summary']['all_test_results'][test_hash]
    except KeyError:
        abort(Response(f'Result for this test does not exist for {date}', 404))
    current_model_types = [
        mt for mt in ALL_MODEL_TYPES if mt in test_stats['test_round_summary']
    ]
    test = current_test["test"]
    # A model type can be valid in general yet have no result in these
    # stats; answer with 404 rather than an unhandled KeyError.
    try:
        test_status, path_list = current_test[model_type]
    except KeyError:
        abort(Response(f'Result for model type {model_type} does not exist '
                       f'for {date}', 404))
    correct, incorrect = _label_curations()
    if isinstance(path_list, list):
        for path in path_list:
            for edge in path['edge_list']:
                for stmt in edge['stmts']:
                    # The first element is the statement URL; a curation
                    # label is appended as an extra element.
                    cur = ''
                    url = stmt[0]
                    if 'stmt_hash' in url:
                        stmt_hashes = parse.parse_qs(
                            parse.urlparse(url).query)['stmt_hash']
                        cur = _set_curation(stmt_hashes, correct, incorrect)
                    stmt.append(cur)
    latest_date = get_latest_available_date(model, test_corpus)
    prefix = f'stats/{model}/test_stats_{test_corpus}_'
    cur_ix = find_index_of_s3_file(file_key, EMMAA_BUCKET_NAME, prefix)
    newly_applied = \
        test_stats['tests_delta']['applied_hashes_delta']['added']
    # No "previous" link for a test that was first applied in this round,
    # nor when there is no file at index cur_ix + 1.
    prev_date = None
    if test_hash not in newly_applied and \
            (cur_ix + 1) < find_number_of_files_on_s3(EMMAA_BUCKET_NAME,
                                                      prefix, '.json'):
        prev_date = last_updated_date(model,
                                      'test_stats',
                                      'date',
                                      tests=test_corpus,
                                      extension='.json',
                                      n=(cur_ix + 1),
                                      bucket=EMMAA_BUCKET_NAME)
    next_date = None
    if cur_ix > 0:
        next_date = last_updated_date(model,
                                      'test_stats',
                                      'date',
                                      tests=test_corpus,
                                      extension='.json',
                                      n=(cur_ix - 1),
                                      bucket=EMMAA_BUCKET_NAME)
    return render_template('tests_template.html',
                           link_list=link_list,
                           model=model,
                           model_type=model_type,
                           all_model_types=current_model_types,
                           test_hash=test_hash,
                           test=test,
                           test_status=test_status,
                           path_list=path_list,
                           formatted_names=FORMATTED_TYPE_NAMES,
                           date=date,
                           latest_date=latest_date,
                           prev=prev_date,
                           next=next_date)
コード例 #6
0
ファイル: api.py プロジェクト: kolusask/emmaa
def get_model_dashboard(model):
    """Render the dashboard page of a model.

    The following query parameters are read from ``request.args``:

    - ``test_corpus``: test corpus to show; defaults to the result of
      ``_default_test`` for the model's config.
    - ``date``: date of the stats to show; defaults to the latest available
      date for the model and test corpus.
    - ``tab``: tab to open, ``'model'`` by default.

    Parameters
    ----------
    model : str
        Name of the model to display.

    Returns
    -------
    The rendered ``model_template.html`` page. A 404 response is produced
    when no test corpus can be identified or when the model or test stats
    for the requested date are not found.
    """
    test_corpus = request.args.get(
        'test_corpus', _default_test(model, get_model_config(model)))
    if not test_corpus:
        abort(Response('Could not identify test corpus', 404))
    # Looked up once and reused both as the default for ``date`` and for
    # the "Latest Data Available" row, instead of querying twice.
    latest_date = get_latest_available_date(model, test_corpus)
    date = request.args.get('date', latest_date)
    tab = request.args.get('tab', 'model')
    user, _ = resolve_auth(dict(request.args))
    model_meta_data = _get_model_meta_data()
    model_stats, _ = get_model_stats(model, 'model', date=date)
    test_stats, _ = get_model_stats(model,
                                    'test',
                                    tests=test_corpus,
                                    date=date)
    if not model_stats or not test_stats:
        abort(
            Response(
                f'Data for {model} and {test_corpus} for {date} '
                f'was not found', 404))
    ndex_id = 'None available'
    description = 'None available'
    for mid, mmd in model_meta_data:
        if mid == model:
            ndex_id = mmd['ndex']['network']
            description = mmd['description']
    if ndex_id == 'None available':
        logger.warning(f'No ndex ID found for {model}')
    available_tests = _get_test_corpora(model)
    model_info_contents = [
        [('', 'Model Description', ''), ('', description, '')],
        [('', 'Latest Data Available', ''), ('', latest_date, '')],
        [('', 'Data Displayed', ''),
         ('', date, 'Click on the point on time graph to see earlier results')
         ],
        [('', 'Network on Ndex', ''),
         (f'http://www.ndexbio.org/#/network/{ndex_id}', ndex_id,
          'Click to see network on Ndex')]
    ]
    test_round_summary = test_stats['test_round_summary']
    test_data = test_round_summary.get('test_data')
    test_info_contents = None
    if test_data:
        test_info_contents = [[('', k.capitalize(), ''), ('', v, '')]
                              for k, v in test_data.items()]
    current_model_types = [
        mt for mt in ALL_MODEL_TYPES if mt in test_round_summary
    ]
    # Get correct and incorrect curation hashes to pass it per stmt
    correct, incorrect = _label_curations()
    # Label every test with its curation, then keep only the rows where at
    # least one model type has a result other than 'n_a'.
    all_tests = []
    for test_hash, result in test_round_summary['all_test_results'].items():
        result['test'].append(_set_curation(test_hash, correct, incorrect))
        if not all(result[mt][0].lower() == 'n_a'
                   for mt in current_model_types):
            all_tests.append((test_hash, result))

    # Fill in the evidence link, the link message and the curation label
    # for every statement in place.
    all_stmts = model_stats['model_summary']['all_stmts']
    for st_hash, st_value in all_stmts.items():
        url_param = parse.urlencode({
            'stmt_hash': st_hash,
            'source': 'model_statement',
            'model': model
        })
        st_value[0] = f'/evidence?{url_param}'
        st_value[2] = stmt_db_link_msg
        st_value.append(_set_curation(st_hash, correct, incorrect))
    most_supported = model_stats['model_summary']['stmts_by_evidence'][:10]
    top_stmts_counts = [(all_stmts[h], ('', str(c), ''))
                        for h, c in most_supported]
    added_stmts_hashes = \
        model_stats['model_delta']['statements_hashes_delta']['added']
    if len(added_stmts_hashes) > 0:
        added_stmts = [(all_stmts[h], ) for h in added_stmts_hashes]
    else:
        added_stmts = 'No new statements were added'
    return render_template(
        'model_template.html',
        model=model,
        model_data=model_meta_data,
        model_stats_json=model_stats,
        test_stats_json=test_stats,
        test_corpus=test_corpus,
        available_tests=available_tests,
        link_list=link_list,
        user_email=user.email if user else "",
        stmts_counts=top_stmts_counts,
        added_stmts=added_stmts,
        model_info_contents=model_info_contents,
        test_info_contents=test_info_contents,
        model_types=[
            "Test", *[FORMATTED_TYPE_NAMES[mt] for mt in current_model_types]
        ],
        new_applied_tests=_new_applied_tests(test_stats, current_model_types,
                                             model, date, test_corpus),
        all_test_results=_format_table_array(all_tests, current_model_types,
                                             model, date, test_corpus),
        new_passed_tests=_new_passed_tests(model, test_stats,
                                           current_model_types, date,
                                           test_corpus),
        date=date,
        latest_date=latest_date,
        tab=tab)
コード例 #7
0
def get_model_deltas(model_name, test_corpora, date, bucket=EMMAA_BUCKET_NAME):
    """Get deltas from model and test stats for further use in tweets and
    email notifications.

    Parameters
    ----------
    model_name : str
        A name of the model to get the updates for.
    test_corpora : list[str]
        A list of test corpora names to get the test updates for.
    date : str
        A date for which the updates should be generated.
    bucket : str
        A name of S3 bucket where the stats files are stored.

    Returns
    -------
    deltas : dict
        A dictionary containing the deltas for the given model and test
        corpora. It is empty when the model stats, or the test stats of
        every requested corpus, could not be found. Test corpora without
        stats are left out of ``deltas['tests']``.
    """
    deltas = {}
    # Forward the bucket so that stats are read from the bucket the caller
    # asked for rather than the lookup's own default.
    model_stats, _ = get_model_stats(model_name, 'model', date=date,
                                     bucket=bucket)
    test_stats_by_corpus = {}
    for test_corpus in test_corpora:
        test_stats, _ = get_model_stats(model_name,
                                        'test',
                                        tests=test_corpus,
                                        date=date,
                                        bucket=bucket)
        if not test_stats:
            # Nothing to compute deltas from; keeping the empty entry would
            # fail below when its contents are accessed.
            logger.info(f'Could not find test stats for {test_corpus}')
            continue
        test_stats_by_corpus[test_corpus] = test_stats
    if not model_stats or not test_stats_by_corpus:
        logger.warning('Stats are not found, cannot generate deltas')
        return deltas
    deltas['model_name'] = model_name
    deltas['date'] = date
    # Model deltas
    stmts_delta = model_stats['model_delta']['statements_hashes_delta']
    paper_delta = model_stats['paper_delta']['raw_paper_ids_delta']
    deltas['stmts_delta'] = stmts_delta
    deltas['new_papers'] = len(paper_delta['added'])
    # Test deltas
    deltas['tests'] = {}
    for test_corpus, test_stats in test_stats_by_corpus.items():
        test_deltas = {}
        test_name = None
        test_data = test_stats['test_round_summary'].get('test_data')
        if test_data:
            test_name = test_data.get('name')
        test_deltas['name'] = test_name
        test_deltas['passed'] = {}
        for k, v in test_stats['tests_delta'].items():
            if k == 'applied_hashes_delta':
                test_deltas['applied_tests'] = v
            else:
                # Every other key is a model type holding its own
                # passed-hashes delta.
                test_deltas['passed'][k] = v['passed_hashes_delta']
        deltas['tests'][test_corpus] = test_deltas
    return deltas