Exemple #1
0
def _get_stmt_row(stmt,
                  source,
                  model,
                  cur_counts,
                  test_corpus=None,
                  path_counts=None,
                  cur_dict=None,
                  with_evid=False):
    """Assemble the one-element row list describing a single statement.

    Parameters
    ----------
    stmt :
        Statement object; must provide ``get_hash()`` and ``evidence``.
    source :
        Value placed under ``source`` in the evidence link query string.
    model :
        Value placed under ``model`` in the evidence link query string.
    cur_counts : dict
        Looked up by the stringified statement hash; the result is handed
        to ``_make_badges``.
    test_corpus : optional
        When truthy, added to the evidence link query string.
    path_counts : dict, optional
        Looked up by the stringified statement hash when truthy. Note that
        a hash missing from a non-empty mapping yields ``None`` rather
        than 0, because the lookup has no default.
    cur_dict : optional
        Curation data forwarded to ``_format_evidence_text``.
    with_evid : bool
        Formatted evidence is only produced when this is true and
        ``cur_dict`` is not None.

    Returns
    -------
    list of tuple
        A single tuple ``(hash, english, evidence, evidence_count,
        badges)``; ``hash`` is the raw ``stmt.get_hash()`` value, not its
        string form.
    """
    hash_key = str(stmt.get_hash())
    stmt_text = _format_stmt_text(stmt)
    num_evid = len(stmt.evidence)

    # Only the first ten formatted evidences are kept for the row.
    if with_evid and cur_dict is not None:
        evid_preview = _format_evidence_text(
            stmt, cur_dict, ['correct', 'act_vs_amt', 'hypothesis'])[:10]
    else:
        evid_preview = []

    query = {'stmt_hash': hash_key, 'source': source, 'model': model,
             'format': 'json'}
    if test_corpus:
        query['test_corpus'] = test_corpus
    json_link = f'/evidence?{parse.urlencode(query)}'

    num_paths = path_counts.get(hash_key) if path_counts else 0
    badges = _make_badges(num_evid, json_link, num_paths,
                          cur_counts.get(hash_key))
    return [(stmt.get_hash(), stmt_text, evid_preview, num_evid, badges)]
def test_tag_bad_text():
    """Check that markup characters in evidence text come out HTML-escaped.

    Both the tagged agent names and the untagged remainder of the sentence
    contain ``<``, ``>`` and ``&``.
    """
    raw_names = ["<Foo>", "Bar&"]
    evidence = Evidence("bogus",
                        text="<Foo> binds Bar& (<10 & >20)",
                        annotations={"agents": {"raw_text": raw_names}})
    stmt = Complex([Agent("Foo"), Agent("Bar")], evidence=[evidence])

    formatted = _format_evidence_text(stmt)[0]

    expected = ('<span class="badge badge-other">&lt;Foo&gt;</span>'
                ' binds '
                '<span class="badge badge-other">Bar&amp;</span>'
                ' (&lt;10 &amp; &gt;20)')
    assert formatted['text'] == expected
Exemple #3
0
def get_tests_by_hash(test_corpus, hash_val):
    """Return the JSON of the cached test statement matching a hash.

    Parameters
    ----------
    test_corpus :
        Identifier passed to ``_load_tests_from_cache``.
    hash_val :
        Statement hash; compared to each test statement's hash by string
        value, and also used to look up curations.

    Returns
    -------
    dict
        ``{'statements': {hash_val: stmt_json}}``, where ``stmt_json`` is
        the statement JSON with its ``evidence`` replaced by formatted
        evidence, or an empty dict when no test matches. If several tests
        match, the last one wins.
    """
    tests = _load_tests_from_cache(test_corpus)

    # Group curation tags by (statement hash, evidence source hash).
    cur_dict = defaultdict(list)
    for curation in get_curations(pa_hash=hash_val):
        key = (curation.pa_hash, curation.source_hash)
        cur_dict[key].append({'error_type': curation.tag})

    wanted = str(hash_val)
    st_json = {}
    for test in tests:
        if str(test.stmt.get_hash()) != wanted:
            continue
        st_json = test.stmt.to_json()
        st_json['evidence'] = _format_evidence_text(
            test.stmt, cur_dict, ['correct', 'act_vs_amt', 'hypothesis'])
    return {'statements': {hash_val: st_json}}
def test_format_evidence_text():
    """Check the shape and content of the formatted evidence of a statement.

    The statement from ``make_stmt`` is expected to format to exactly one
    evidence dict with a fixed key set and subject/object badge markup
    around the agent mentions.
    """
    formatted = _format_evidence_text(make_stmt())
    assert len(formatted) == 1

    entry = formatted[0]
    assert isinstance(entry, dict)

    expected_keys = {
        'source_api', 'text_refs', 'text', 'source_hash', 'pmid',
        'num_curations', 'num_correct', 'num_incorrect'
    }
    assert set(entry.keys()) == expected_keys
    assert entry['source_api'] == 'test'
    assert entry['text_refs']['PMID'] == '1234567'

    expected_text = (
        'We noticed that the '
        '<span class="badge badge-subject">Src kinase</span> '
        'was able to phosphorylate '
        '<span class="badge badge-object">'
        'Ras proteins</span>.')
    # Show the actual text on failure to make the mismatch easy to spot.
    assert entry['text'] == expected_text, entry['text']
Exemple #5
0
    def decorator(*args, **kwargs):
        """Run the wrapped query builder and package its result as a response.

        The generic options ``offset``, ``ev_limit``, ``best_first``,
        ``max_stmts``, ``format``, ``with_english`` and ``with_cur_counts``
        are removed from a copy of the request arguments with ``_pop``; what
        remains is handed to ``get_db_query``. The resulting query is run,
        restricted content is filtered or redacted according to the caller's
        permissions, curation counts are optionally merged in, and the
        outcome is rendered as HTML (``format=html``) or JSON (anything
        else).

        Parameters
        ----------
        *args, **kwargs
            Forwarded unchanged to ``get_db_query`` after the web query.

        Returns
        -------
        Response
            Either the ``Response`` returned directly by ``get_db_query``,
            or a new one holding the HTML page or the JSON dump.

        Raises
        ------
        RuntimeError
            If ``get_db_query`` returns something that is neither a
            ``Response`` nor a ``QueryCore``.
        """
        tracker = LogTracker()
        start_time = datetime.now()
        logger.info("Got query for %s at %s!" %
                    (get_db_query.__name__, start_time))

        web_query = request.args.copy()
        offs = _pop(web_query, 'offset', type_cast=int)
        ev_lim = _pop(web_query, 'ev_limit', type_cast=int)
        best_first = _pop(web_query, 'best_first', True, bool)
        # The caller may ask for fewer statements than MAX_STATEMENTS, but
        # never more.
        max_stmts = min(_pop(web_query, 'max_stmts', MAX_STATEMENTS, int),
                        MAX_STATEMENTS)
        fmt = _pop(web_query, 'format', 'json')
        w_english = _pop(web_query, 'with_english', False, bool)
        w_cur_counts = _pop(web_query, 'with_cur_counts', False, bool)

        # Figure out authorization.
        has = dict.fromkeys(['elsevier', 'medscan'], False)
        # `user` must be bound even when authentication is skipped, because
        # the HTML branch below inspects it.
        user = None
        if not TESTING:
            user, roles = resolve_auth(web_query)
            for role in roles:
                for resource in has.keys():
                    has[resource] |= role.permissions.get(resource, False)
            logger.info('Auths: %s' % str(has))
        else:
            web_query.pop('api_key', None)
            has['elsevier'] = False
            has['medscan'] = False

        # Actually run the function.
        logger.info("Running function %s after %s seconds." %
                    (get_db_query.__name__, sec_since(start_time)))
        db_query = get_db_query(web_query, *args, **kwargs)
        if isinstance(db_query, Response):
            return db_query
        elif not isinstance(db_query, QueryCore):
            raise RuntimeError("Result should be a child of QueryCore.")

        # Single-statement lookups get a far larger default evidence limit
        # than list queries.
        if ev_lim is None:
            if get_db_query is get_statement_by_hash:
                ev_lim = 10000
            else:
                ev_lim = 10

        # Without medscan access, narrow both the query and the evidence.
        if not has['medscan']:
            minus_q = ~HasOnlySource('medscan')
            db_query &= minus_q
            ev_filter = minus_q.ev_filter()
        else:
            ev_filter = None

        result = db_query.get_statements(offset=offs,
                                         limit=max_stmts,
                                         ev_limit=ev_lim,
                                         best_first=best_first,
                                         evidence_filter=ev_filter)

        logger.info("Finished function %s after %s seconds." %
                    (get_db_query.__name__, sec_since(start_time)))

        # Handle any necessary redactions
        res_json = result.json()
        stmts_json = res_json.pop('results')
        elsevier_redactions = 0
        source_counts = result.source_counts
        if not all(has.values()) or fmt == 'json-js' or w_english:
            for h, stmt_json in stmts_json.copy().items():
                if w_english:
                    stmt = stmts_from_json([stmt_json])[0]
                    stmt_json['english'] = _format_stmt_text(stmt)
                    stmt_json['evidence'] = _format_evidence_text(stmt)

                # Drop the medscan count before the shortcut below, so that
                # a user with elsevier but without medscan access does not
                # see it.
                if not has['medscan']:
                    source_counts[h].pop('medscan', 0)

                # In most cases there is nothing left to do per evidence.
                if has['elsevier'] and fmt != 'json-js' and not w_english:
                    continue

                for ev_json in stmt_json['evidence'][:]:
                    if fmt == 'json-js':
                        ev_json['source_hash'] = str(ev_json['source_hash'])

                    # Check for elsevier and redact if necessary
                    if not has['elsevier'] and \
                            get_source(ev_json) == 'elsevier':
                        text = ev_json['text']
                        if len(text) > 200:
                            ev_json['text'] = text[:200] + REDACT_MESSAGE
                            elsevier_redactions += 1

        logger.info(f"Redacted {elsevier_redactions} pieces of elsevier "
                    f"evidence.")

        logger.info("Finished redacting evidence for %s after %s seconds." %
                    (get_db_query.__name__, sec_since(start_time)))

        # Get counts of the curations for the resulting statements.
        if w_cur_counts:
            curations = get_curations(pa_hash=set(stmts_json.keys()))
            logger.info("Found %d curations" % len(curations))
            cur_counts = {}
            for curation in curations:
                # Update the overall counts.
                if curation.pa_hash not in cur_counts:
                    cur_counts[curation.pa_hash] = 0
                cur_counts[curation.pa_hash] += 1

                # Work these counts into the evidence dict structure.
                for ev_json in stmts_json[curation.pa_hash]['evidence']:
                    if str(ev_json['source_hash']) == str(
                            curation.source_hash):
                        ev_json['num_curations'] = \
                            ev_json.get('num_curations', 0) + 1
                        break
            res_json['num_curations'] = cur_counts

        # Add derived values to the res_json.
        res_json['offset'] = offs
        res_json['evidence_limit'] = ev_lim
        res_json['statement_limit'] = MAX_STATEMENTS
        res_json['statements_returned'] = len(stmts_json)
        # Compare against the limit actually used for this query: a short
        # page is only the end if it is shorter than what was asked for.
        res_json['end_of_statements'] = (len(stmts_json) < max_stmts)
        res_json['statements_removed'] = 0
        res_json['evidence_returned'] = result.returned_evidence

        if fmt == 'html':
            title = TITLE + ': ' + 'Results'
            ev_totals = res_json.pop('evidence_totals')
            stmts = stmts_from_json(stmts_json.values())
            html_assembler = HtmlAssembler(stmts,
                                           res_json,
                                           ev_totals,
                                           source_counts,
                                           title=title,
                                           db_rest_url=request.url_root[:-1])
            idbr_template = env.get_template('idbr_statements_view.html')
            identity = user.identity() if user else None
            content = html_assembler.make_model(idbr_template,
                                                identity=identity)
            # Surface anything the log tracker recorded during this request.
            if tracker.get_messages():
                level_stats = [
                    '%d %ss' % (n, lvl.lower())
                    for lvl, n in tracker.get_level_stats().items()
                ]
                msg = ' '.join(level_stats)
                content = html_assembler.append_warning(msg)
            mimetype = 'text/html'
        else:  # Return JSON for all other values of the format argument
            res_json.update(tracker.get_level_stats())
            res_json['statements'] = stmts_json
            res_json['source_counts'] = source_counts
            content = json.dumps(res_json)
            mimetype = 'application/json'

        resp = Response(content, mimetype=mimetype)
        logger.info("Exiting with %d statements with %d/%d evidence of size "
                    "%f MB after %s seconds." %
                    (res_json['statements_returned'],
                     res_json['evidence_returned'], res_json['total_evidence'],
                     sys.getsizeof(resp.data) / 1e6, sec_since(start_time)))
        return resp
Exemple #6
0
    def process_entries(self, result):
        """Post-process the entries of a query result in place.

        Depending on the settings on ``self``, each entry may get an
        English rendering (and, under ``self.strict``, be dropped when its
        agents form a proper subset of ``self.agent_set``), lose its
        medscan counts, have long elsevier evidence text truncated, and
        have its hashes turned into strings for the ``json-js`` format.
        Results of type ``'hashes'`` are left untouched.

        Parameters
        ----------
        result :
            Query result providing ``result_type`` and ``results``, plus
            ``source_counts`` (for statements) or ``evidence_counts`` (for
            the other entry types).

        Returns
        -------
        None
        """
        res_type = result.result_type
        if res_type == 'hashes':
            # There is really nothing to do for hashes.
            return

        is_stmts = (res_type == 'statements')
        js_safe = (self.fmt == 'json-js')
        num_redacted = 0

        # The entries only need visiting if something must be added,
        # withheld, or re-encoded.
        if self.w_english or js_safe or not all(self.has.values()):
            for key, entry in result.results.copy().items():
                # Build the english representation of the entry.
                if self.w_english:
                    stmt = None
                    if self.strict:
                        if is_stmts:
                            stmt = stmts_from_json([entry])[0]
                            if type(stmt) == Complex:
                                # Put the members in the queried agent order.
                                position = {
                                    name: int(idx)
                                    for idx, name in self.agent_dict.items()
                                }
                                stmt.members.sort(
                                    key=lambda mem: position.get(mem.name, 10)
                                )
                            found_agents = {ag.name
                                            for ag in stmt.agent_list()
                                            if ag is not None}
                        else:
                            found_agents = set(entry['agents'].values())
                            if res_type == 'relations' \
                                    and entry['type'] == 'Complex':
                                entry['agents'] = self.agent_dict
                        if found_agents < self.agent_set:
                            result.results.pop(key, None)
                            continue

                    if is_stmts:
                        if stmt is None:
                            stmt = stmts_from_json([entry])[0]
                        english = _format_stmt_text(stmt)
                        entry['evidence'] = _format_evidence_text(stmt)
                    else:
                        english = _make_english_from_meta(entry['agents'],
                                                          entry.get('type'))
                    if not english:
                        logger.warning(f"English not formed for {key}:\n"
                                       f"{entry}")
                    entry['english'] = english

                # Hide medscan from users without medscan privileges.
                if not self.has['medscan']:
                    if is_stmts:
                        result.source_counts[key].pop('medscan', 0)
                    else:
                        result.evidence_counts[key] -= \
                            entry['source_counts'].pop('medscan', 0)
                        entry['total_count'] = result.evidence_counts[key]
                        if not entry['source_counts']:
                            logger.warning("Censored content present.")

                # Usually nothing below applies, so move on early.
                if self.has['elsevier'] and not (js_safe or self.w_english):
                    continue

                if is_stmts:
                    for ev_json in entry['evidence'][:]:
                        if js_safe:
                            ev_json['source_hash'] = str(ev_json['source_hash'])

                        # Only elsevier evidence the user may not read in
                        # full needs redacting.
                        if self.has['elsevier'] \
                                or get_source(ev_json) != 'elsevier':
                            continue
                        text = ev_json['text']
                        if len(text) > 200:
                            ev_json['text'] = text[:200] + REDACT_MESSAGE
                            num_redacted += 1
                elif js_safe:
                    # Stringify hashes, whether given as a list or singly.
                    if entry.get('hashes') is not None:
                        entry['hashes'] = list(map(str, entry['hashes']))
                    elif 'hash' in entry:
                        entry['hash'] = str(entry['hash'])

        if is_stmts:
            logger.info(f"Redacted {num_redacted} pieces of elsevier "
                        f"evidence.")

        logger.info(f"Process entries for {self.__class__.__name__} after "
                    f"{sec_since(self.start_time)} seconds.")