コード例 #1
0
ファイル: tweet_credrev.py プロジェクト: expertailab/acred
def extract_relevant_sentences(tweet, cfg):
    """Extract all relevant sentences from a tweet.

    :param tweet: dict with at least `text` and `tweet_id` fields
    :param cfg: configuration map
    :returns: dict with keys `in_tweet` (sentences extracted from the
      tweet text proper, possibly cleaned) and `timings` (timing info
      for the extraction steps)
    :rtype: dict
    """
    overall_start = citimings.start()
    detector = tweetsents.build_sent_detector(cfg)
    build_detector_t = citimings.timing("sent_detector", overall_start)

    detect_start = citimings.start()
    detected = detector(tweet['text'])
    detect_t = citimings.timing('sent_detection', detect_start)

    in_tweet = tweetsents.build_in_tweet_info(tweet['tweet_id'], detected, cfg)

    # TODO  1.4 if there are complex sentences, extract clauses
    return {
        'in_tweet': in_tweet,
        'timings': citimings.timing(
            'tweet_relevant_sentences', overall_start,
            [build_detector_t, detect_t])
    }
コード例 #2
0
def search_claim(q_claim):
    """Find similar claims or sentences in the co-inform claim database.

    :param q_claim: an English sentence/claim, or a list of such
      sentences. Multiple sentences within a single claim are not
      allowed.
    :type q_claim: str or list
    :returns: dict with `results` (one `SemanticClaimSimilarityResult`
      per query claim) and a `resultsHeader` with timings and the query
      params
    :rtype: dict
    :raises InvalidUsage: if `q_claim` is None or not a str/list
    """
    # Validate before use: the original checked for None *after* the
    # type checks and left `q_claims` unbound (NameError) for any
    # unsupported input type.
    if q_claim is None:
        raise InvalidUsage("Claim is mandatory")
    if isinstance(q_claim, str):
        q_claims = [q_claim]
    elif isinstance(q_claim, list):
        q_claims = q_claim
    else:
        raise InvalidUsage(
            "Claim must be a string or a list of strings, not %s" %
            type(q_claim))
    start = citimings.start()
    logger.info('Searching semantic vector space for %s claim(s)' %
                len(q_claims))
    topn = 5
    preds, claim_ids, simReviewer = search_semantic_vecspace(
        q_claims, topn=topn)
    search_semspace_t = citimings.timing('search_semantic_vecspace', start)

    assert len(preds) == len(claim_ids)
    assert len(q_claims) == len(preds)
    q_resp, claim_retrieve_t = retrieve_result_claims(
        claim_ids, q_claims, topn)

    start3 = citimings.start()
    results, sub_build_ts = [], []
    # build one SemanticClaimSimilarityResult per query claim
    for qclaim, ids, pred_scores in zip(q_claims, claim_ids, preds):
        start4 = citimings.start()
        claim_id2pred = {idx: float(pred)
                         for idx, pred in zip(ids, pred_scores)}
        relsents, sub_ts = q_resp_to_related_sent(q_resp, claim_id2pred)
        results.append({
            '@context': ci_context,
            '@type': 'SemanticClaimSimilarityResult',
            'dateCreated': isodate.now_utc_timestamp(),
            'q_claim': qclaim,
            'simReviewer': simReviewer,
            'results': relsents})
        sub_build_ts.append(citimings.timing('build_result', start4, sub_ts))
    result_build_t = citimings.timing('build_results', start3, sub_build_ts)

    results, stance_pred_t = add_stance_detection(
        results, sim_threshold=stance_min_sim_threshold)

    timing = citimings.timing(
        'search_claim', start,
        [search_semspace_t, claim_retrieve_t,
         result_build_t, stance_pred_t])
    return {
        'results': results,
        'resultsHeader': {
            'QTime': timing['total_ms'],
            'timings': timing,
            'params': {
                'claim': q_claim
            }}}
コード例 #3
0
ファイル: tweetrelsents.py プロジェクト: expertailab/acred
def analyzed_doc(url, cfg):
    """Scrape `url` and return an analysed document for it.

    Previously indexed analyses are reused when available; otherwise
    the scraped page is analysed from scratch. The returned dict always
    carries a `timings` field.

    :param url: the url to scrape and analyse
    :param cfg: config options (`relsents_in_colls` selects collections)
    :rtype: dict
    """
    logger.info("Scraping " + url)
    t0 = citimings.start()
    scraped = url_scraper.scrape(url)
    scrape_t = citimings.timing('url_scraping', t0)

    resolved_url = scraped['resolved_url']
    t1 = citimings.start()
    colls = cfg.get(
        'relsents_in_colls',
        ['pilot-se', 'pilot-gr', 'pilot-at', 'factcheckers', 'fc-dev'])
    assert cisearch_available
    cached = cisearch.find_preindexed_doc_by_url(resolved_url, colls)
    lookup_t = citimings.timing('retrieve_preindexed', t1)
    if cached is not None:
        logger.info(
            'Found previously analyzed doc with %s claims and keys %s' % (
                len(cached.get('claims_content', [])),
                list(cached.keys())
            ))
        cached['timings'] = citimings.timing(
            'analyzed_doc', t0, [scrape_t, lookup_t])
        return cached
    logger.info('Document not in existing indices, analysing...')
    t2 = citimings.start()
    result = do_analyze_doc(scraped, cfg)
    analyze_t = citimings.timing('semantic_analysis', t2)
    result['timings'] = citimings.timing(
        'analyzed_doc', t0, [scrape_t, lookup_t, analyze_t])
    return result
コード例 #4
0
ファイル: tweet_credrev.py プロジェクト: expertailab/acred
def review_sents_in_tweet(tweet, cfg):
    """Extract sentences in `tweet` and review their credibility.

    :param tweet: a `Tweet` dict
    :param cfg: config options
    :returns: a tuple with a list of credibility reviews and a list of
      timings for the steps. The review format depends on
      `cfg['acred_review_format']`
    :rtype: tuple
    """
    relsents = extract_relevant_sentences(tweet, cfg)
    relsents_t = relsents.get('timings', None)

    review_start = citimings.start()
    tweet_sents = relsents['in_tweet']
    logger.info("Found %d relevant sentences in tweet" % len(tweet_sents))
    texts = [s['text'] for s in tweet_sents]
    fmt = cfg.get('acred_review_format', 'schema.org')
    if fmt == 'cred_assessment':
        reviews = aggqsent_credrev.calc_claim_cred(texts, cfg)
    else:
        sentences = [
            content.as_sentence(txt, appearance=[tweet], cfg=cfg)
            for txt in texts]
        reviews = aggqsent_credrev.review(sentences, cfg)
    review_t = citimings.timing('sents_in_tweet', review_start)
    return reviews, [relsents_t, review_t]
コード例 #5
0
ファイル: website_credrev.py プロジェクト: expertailab/acred
def calc_domain_credibility(domain, cfg=None):
    """Calculates a `DomainCredibility` for a domain via MisinfoMe

    Note that `DomainCredibility` is deprecated, use the `review` method
    which produces a `WebSiteCredReview` instead.

    :param domain: str e.g. `www.snopes.com`
    :param cfg: unused, kept for backward compatibility. The default
      was changed from a shared mutable `{}` to `None`.
    :returns: a `DomainCredibility`
    :rtype: dict
    """
    if domain is None:
        return default_domain_crediblity(
            domain, "Default credibility for unknown domain")
    # fixed: the assertion message was missing the `%` operator
    # ('...%s' (type(domain))), so a failing assertion raised
    # TypeError instead of a readable AssertionError
    assert type(domain) == str, 'Expecting str, but was %s' % type(domain)
    start = citimings.start()
    try:
        return {
            **misinfome_source_credibility(domain),
            '@context': 'DomainCredibility',
            '@type': 'DomainCredibility',
            'dateCreated': isodate.now_utc_timestamp(),
            'timings': citimings.timing('misinfome_source_credibility',
                                        start)
        }
    except Exception as e:
        # best-effort: fall back to a neutral default on any failure
        logger.error("Failed misinfome source credibility. " + str(e))
        return default_domain_crediblity(
            domain, "Unable to retrieve credibility assessment")
コード例 #6
0
def as_related_sent_or_claimReview(db_claim_doc, claimid2pred):
    """Convert a claim DB document into a `SimilarSent` dict.

    :param db_claim_doc: a DB claim document; multi-valued string
      fields `urls_ss` and `domains_ss` are comma-separated
    :param claimid2pred: map from claim id to predicted similarity
    :returns: tuple (SimilarSent dict, timing object)
    :rtype: tuple
    """
    start = citimings.start()
    multival_separator = ','
    # fixed: ''.split(',') returns [''], so the "missing domains_ss"
    # fallback below could never trigger and empty strings leaked
    # through as domain/doc_url; filter out empty values after splitting
    doc_urls = [u for u in db_claim_doc.get('urls_ss', '').split(
        multival_separator) if u]
    domains = [d for d in db_claim_doc.get('domains_ss', '').split(
        multival_separator) if d]
    domain = None
    if len(domains) == 0:
        # claim doc is missing domains_ss; derive from the first url
        if len(doc_urls) > 0:
            domain = content.domain_from_url(doc_urls[0])
    else:
        domain = domains[0]

    return {
        '@context': ci_context,
        '@type': 'SimilarSent',
        'sentence': db_claim_doc['content_t'],
        # 0.5 is the neutral fallback similarity for unseen claim ids
        'similarity': claimid2pred.get(db_claim_doc['id'], 0.5),
        'doc_url': None if len(doc_urls) == 0 else doc_urls[0],
        'appearance': doc_urls,
        'lang_orig': db_claim_doc.get('lang_s', None),
        'published_date': db_claim_doc.get(
            'published_dts', [None])[0] or db_claim_doc.get(
                'schema_org_cr_itemReviewed_datePublished_tdt', None),
        'domain': domain,
        'claimReview': lookup_claimReview_url(
            db_claim_doc['schema_org_cr_url'], claimReview_db)
    }, citimings.timing('as_related_sent', start,
                        [])
コード例 #7
0
ファイル: tweet_credrev.py プロジェクト: expertailab/acred
def review_linked_docs_in_tweet(tweet, cfg):
    """Review the credibility of any docs linked in the tweet.

    :param tweet: a `Tweet` dict. We expect it to have a field `urls`
      with a list of URL objects (each with a `short_url`)
    :param cfg: config options
    :returns: a tuple with (i) a list of credibility reviews for the
      webpages linked in tweet and (ii) a timing object for this method
    :rtype: tuple
    """
    t0 = citimings.start()
    # Retrieve or request doc credibilities; dedupe the linked urls first
    unique_urls = list({u['short_url'] for u in tweet['urls']})
    # TODO retrieve existing credibility from DB or
    # calculate from scratch
    reviews = [
        article_credrev.review({
            '@context': 'http://schema.org',
            '@type': 'Webpage',
            'url': url,
            'mentioned_in': tweet
        }, cfg)
        for url in unique_urls]
    timing = citimings.timing(
        'sub_doc_cred', t0,
        [rev['timings'] for rev in reviews if 'timings' in rev])
    return reviews, timing
コード例 #8
0
ファイル: views.py プロジェクト: expertailab/acred
def predict_stance():
    """Flask view: predict stance labels for (claim, doc_body) pairs.

    Accepts either a single JSON object with `qclaim` and `doc_bodies`
    fields, or a list of such objects. Responds with predicted labels,
    confidences and model/timing metadata. Returns HTTP 400 on bad
    requests and HTTP 500 on unexpected errors.
    """
    try:
        start = citimings.start()
        req_json = request.get_json()
        # normalise: treat a single request object as a one-element batch
        # (removes the duplicated parsing/response-building of the original)
        reqs = req_json if isinstance(req_json, list) else [req_json]
        inputs = []
        for claim_bods in reqs:
            qclaim = claim_bods['qclaim']
            doc_bodies = claim_bods['doc_bodies']
            validate_stance_pred_q(qclaim, doc_bodies)
            inputs.extend([(qclaim, docbod) for docbod in doc_bodies])

        tokmodmeta = resources.stance_tokmodmeta
        if len(inputs) == 0:
            labels, confs = [], []
        else:
            labels, confs = stancepred.predict_stances(tokmodmeta, inputs)
        return jsonify({
            'labels': labels,
            'confidences': confs,
            'meta': {
                'model_info': tokmodmeta['model_info'],
                'timings': citimings.timing('predict_stance', start),
                'n_pairs': len(inputs)
            }
        })
    except werkzeug.exceptions.BadRequest as e:
        logger.exception(e)
        return 'bad request! ' + str(e), 400
    except Exception as e:
        logger.exception(e)
        resp = jsonify({"error": str(e)})
        resp.status_code = 500
        return resp
コード例 #9
0
def claimsim_result_as_claimcred(claimsim_result, cfg):
    """Convert a `SemanticClaimSimilarityResult` into a `ClaimCredibility`.

    :param claimsim_result: dict with fields `q_claim` and `results`
    :param cfg: config options
    :returns: a `ClaimCredibility` dict for the query claim
    :rtype: dict
    """
    # TODO: delegate to reviewers to convert claimsim_result into
    # QSentCredReview, DBClaimCredibilityReview, WebSiteCredReview, etc.
    agg_start = citimings.start()
    qsent = claimsim_result['q_claim']
    relsents = claimsim_result['results']

    # claim search no longer does domain credibility, so fill it in here
    for relsent in relsents:
        if 'domain_credibility' not in relsent:
            relsent['domain_credibility'] = website_credrev.calc_domain_credibility(
                relsent['domain'])

    relsents = [add_relative_credibility(rs, cfg) for rs in relsents]
    cred = aggregate_credibility(relsents, cfg)
    cred['source'] = 'credibility of %d related claims ' % len(relsents)
    agg_t = citimings.timing('claim_relsent_agg', agg_start)
    return {
        '@context': ci_context,
        '@type': 'ClaimCredibility',
        'claim': qsent,
        'item_assessed': {
            '@context': ci_context,
            '@type': 'Claim',
            'claim': qsent
        },
        'aggQSentCredReview': claimsim_result_as_aggQSentCredReview(
            claimsim_result, cfg),
        'related_claims': _partition_related_sents(relsents, cfg),
        'date_assessed': isodate.now_utc_timestamp(),
        'assessor': {
            '@context': ci_context,
            '@type': 'CredibilityAssessor',
            'name': 'SemanticSimilarityClaimCredibilityAssessor',
            'version': '20200208'
        },
        'credibility': cred,
        'timings': agg_t
    }
コード例 #10
0
def retrieve_result_claims(claim_ids, q_claims, topn):
    """Fetch the claim documents for a batch of claim-id results.

    :param claim_ids: per-query lists of matching claim ids
    :param q_claims: the query claims (for logging only)
    :param topn: matches requested per query (for logging only)
    :returns: tuple (solr-style response dict, timing object)
    :rtype: tuple
    """
    unique_ids = set(np.array(claim_ids).flatten())
    logger.info("Top %d claims for %d query sents resulted in %d claims" % (
        topn, len(q_claims), len(unique_ids)))

    t0 = citimings.start()
    dbdocs = find_in_dbs(
        dbs=[preCrawled_sents_db, claimReviewed_sents_db],
        q_ids=list(unique_ids))
    retrieve_t = citimings.timing('retrieve_claims', t0)
    return {'response': {'docs': dbdocs}}, retrieve_t
コード例 #11
0
def do_assess_doc_content_cred(adoc, cfg):
    """Assess the credibility of the claims selected from an analysed doc.

    :param adoc: an analysed document
    :param cfg: config options
    :returns: an aggregate credibility dict with a `timings` field
    :rtype: dict
    """
    t0 = citimings.start()
    doc_claims = select_claims_in_doc(adoc, cfg)
    subcreds = aggqsent_credrev.calc_claim_cred(
        [c['text'] for c in doc_claims], cfg)
    logger.info("Found %d doc subclaim credibilities" % len(subcreds))
    agg = aggregate_sub_creds(subcreds, 'document', cfg)
    agg['timings'] = citimings.timing(
        'assess_doc_content_cred', t0,
        [sub.get('timings', None) for sub in subcreds])
    return agg
コード例 #12
0
def calc_claim_cred(sents, cfg):
    """Produce `ClaimCredibility` assessments for a list of sentences.

    :param sents: list of input sentences (assumed to be claims)
    :param cfg: config parameters
    :returns: a list of coinform `ClaimCredibility` assessments
    :rtype: list
    """
    t0 = citimings.start()
    simresults = claimsim.find_related_sentences(sents, cfg)
    search_t = citimings.timing('find_relsents', t0)

    creds = [claimsim_result_as_claimcred(sr, cfg) for sr in simresults]
    # fold the shared search timing into each per-claim timing
    for cred in creds:
        cred['timings'] = citimings.timing(
            'claimcred', t0, [search_t, cred['timings']])

    return creds
コード例 #13
0
ファイル: predictor.py プロジェクト: expertailab/acred
def assess_doc_cred(doc, cfg):
    """Main credibility assessment for a single doc

    :param doc: a validated and normalised document, ready for credibility
      assessment
    :param cfg: any configs we need to execute/customise the assessment
    :returns: a credibility assessment for the doc; the shape of the
      error result for unsupported docs depends on
      `cfg['acred_review_format']`
    :rtype: dict
    """
    start = citimings.start()
    # supported doc types are delegated to their specialised reviewers
    if content.is_tweet_doc(doc):
        return tweet_credrev.review(doc, cfg)
    if content.is_article_doc(doc):
        return article_credrev.review(doc, cfg)

    # unsupported doc type: build an "error" review in the requested format
    rev_format = cfg.get('acred_review_format', 'schema.org')
    # fixed: message had an unbalanced closing parenthesis ("...%s))")
    msg = 'Unsupported document (not a %s)' % supported_doc_types
    if rev_format == 'cred_assessment':
        return {
            '@context': ci_context,
            '@type': 'DocumentCredibilityAssessment',
            'doc_url': doc['url'],
            'item_assessed': doc,
            'cred_assessment_error': msg,
            'date_assessed': isodate.now_utc_timestamp(),
            'timings': citimings.timing('assess_doc_cred', start),
            'credibility': 0,
            'confidence': 0,
            'explanation': msg}
    rating = {
        '@type': 'Rating',
        'ratingValue': 0.0,
        'confidence': 0.0,
        'ratingExplanation': msg}
    result = {
        '@context': ci_context,
        '@type': 'DocumentCredReview',
        'reviewAspect': 'credibility',
        'itemReviewed': doc,
        'dateCreated': isodate.now_utc_timestamp(),
        'author': bot_info([], cfg),
        'reviewRating': {
            **rating,
            'identifier': itnorm.calc_identifier(rating, cfg)}
    }
    return {
        **result,
        'identifier': itnorm.calc_identifier(result, cfg)
    }
コード例 #14
0
def assess_article_cred(article, cfg):
    """Main credibility assessment for a single article.

    *Deprecated* you should move to `review_article`

    :param article: valid and normalised article
    :param cfg: config to guide this assessment
    :returns: a credibility assessment for the article
    :rtype: dict
    """
    t0 = citimings.start()

    adoc = analyzed_doc(article, cfg)
    domcred = adoc_to_domain_cred(adoc, cfg)
    content_cred = assess_doc_content_cred(adoc, cfg)
    agg_cred = aggregate_article_cred(domcred, content_cred, cfg)

    timing = citimings.timing('assess_article_cred', t0, [
        adoc['timings'],
        domcred.get('timings', None),
        content_cred.get('timings', None)])
    return {
        '@context': content.ci_context,
        '@type': 'ArticleCredibilityAssessment',
        'doc_url': article['url'],
        'item_assessed': article,
        # NOTE(review): 'date_asessed' is misspelled but kept as-is;
        # existing consumers may depend on this exact key
        'date_asessed': isodate.now_utc_timestamp(),
        'assessor': {
            '@context': content.ci_context,
            '@type': 'CredibilityAssessor',
            'name': 'ArticleCredibilityAssessor',
            'version': '20200207'
        },
        'doc_resolved_url': adoc.get('resolved_url', adoc.get('url')),
        'analyzed_doc': adoc,
        **agg_cred,
        'sub_assessments': [domcred, content_cred],
        'timings': timing
        # 'claims_in_doc': claim_creds,
        # 'domain_credibility': domcred,
        # 'content_credibility': content_cred
    }
コード例 #15
0
ファイル: test_citimings.py プロジェクト: expertailab/acred
def f_composite():
    """Composite test operation combining `f` and `g`, merging their
    outputs and nesting their timings under its own."""
    t0 = citimings.start()
    sleep(0.01)
    f_out = f()
    g_out = g()
    sub_timings = [p['timings'] for p in [f_out, g_out]
                   if p and type(p) is dict and 'timings' in p]
    merged = {**f_out, **g_out}
    merged['timings'] = citimings.timing('f_composite', t0, sub_timings)
    return merged
コード例 #16
0
ファイル: website_credrev.py プロジェクト: expertailab/acred
def default_domain_crediblity(domain, explanation):
    """Build a neutral fallback `DomainCredibility` for `domain`.

    Note: the function name keeps its historical misspelling because
    callers reference it by this exact name.

    :param domain: the domain being assessed (may be None)
    :param explanation: human-readable reason for the default value
    :rtype: dict
    """
    t0 = citimings.start()
    credibility = {
        '@context': ci_context,
        '@type': 'DomainCredibility',
        'item_assessed': domain,
        "value": 0.0,  # in range [-1, 1]
        "confidence": 0.0,
        "explanation": explanation,
        'timings': citimings.timing('default_domain_crediblity', t0)
    }
    return {"credibility": credibility, "assessments": []}
コード例 #17
0
ファイル: tweet_credrev.py プロジェクト: expertailab/acred
def assess_tweet_cred(tweet, cfg):
    """Main credibility assessment for a single tweet.

    *Deprecated* use `review_tweet` instead

    :param tweet: must have field `text` with the textual content,
      and field `urls` with shortened urls appearing in the original
      tweet content
    :param cfg: config options; `acred_review_format` must be
      'cred_assessment'
    :returns: a credibility assessment for the tweet
    :rtype: dict
    """
    t0 = citimings.start()

    fmt = cfg.get('acred_review_format', 'schema.org')
    assert fmt == 'cred_assessment', fmt

    # generate (or retrieve) sub reviews
    sent_reviews, sub_timings = review_sents_in_tweet(tweet, cfg)
    doc_reviews, doc_reviews_t = review_linked_docs_in_tweet(tweet, cfg)

    agg_start = citimings.start()
    agg = aggregate_tweet_cred(sent_reviews, doc_reviews, cfg)
    result = remove_tweet_assessment_details({
        '@context': ci_context,
        '@type': 'TweetCredibilityAssessment',
        'tweet_id': int(tweet['tweet_id']),
        'sub_assessments': sent_reviews + doc_reviews,
        'item_assessed': tweet,
        'sentences_in_tweet': sent_reviews,
        **agg
    })
    agg_t = citimings.timing('aggregation_cleaning_time', agg_start)

    sub_timings += [doc_reviews_t, agg_t]
    result['timings'] = citimings.timing('assess_tweet_cred', t0, sub_timings)
    return result
コード例 #18
0
def analyzed_doc(article, cfg):
    """Return an analysed version of an input article.

    :param article: an `Article` item, really anything with fields `url`,
      `content` and `id`. See `semantic_analyzer.analyzer.analyze_doc`.
    :param cfg: config options
    :returns: an analyzed doc. Crucially, it will contain a field
      `claims_content`. See `semantic_analyzer.analyzer.analyze_doc`
      for the basic analysed doc.
    :rtype: dict
    """
    t0 = citimings.start()
    colls = cfg.get('relsents_in_colls', [
        'generic', 'pilot-se', 'pilot-gr', 'pilot-at', 'factcheckers',
        'fc-dev'])
    cached = gcssearch.find_preindexed_doc_by_url(article['url'], colls)
    if cached is None:
        # the url may redirect; retry the lookup with the resolved url
        resolved = url_scraper.fetch_url(article['url'])['resolved_url']
        if resolved != article['url']:
            cached = gcssearch.find_preindexed_doc_by_url(resolved, colls)
            # TODO: we may want to add the article['url'] as an alias
            #  for this, the DB schema needs to support this and we
            #  need to be able to submit new values for this list
            #  of url values. Define `same_as_ss` and update
            #  gcsearch to query and update this.
    lookup_t = citimings.timing('retrieve_preindexed', t0)
    if cached is not None:
        cached['timings'] = lookup_t
        return cached
    adoc = semalyzer.analyze_doc(article, {**cfg, 'expand_claims': True})
    adoc['timings'] = citimings.timing(
        'analyzed_doc', t0, [lookup_t, adoc.get('timings')])
    return adoc
コード例 #19
0
def predict_worthiness():
    """Flask view: predict check-worthiness labels for sentences.

    Expects a JSON body with a `sentences` field (a string or a list of
    strings) and responds with predicted labels, confidences and
    sentence ids, plus model/timing metadata. Returns HTTP 400 on bad
    requests and HTTP 500 on unexpected errors.
    """
    try:
        tokmodmeta = resources.worthiness_tokmodmeta
        start = citimings.start()
        req_json = request.get_json()
        q_sents = req_json['sentences']
        if q_sents is None:
            raise ValueError("sentences parameter is mandatory, only got %s" %
                             (req_json))
        if isinstance(q_sents, str):
            q_sents = [q_sents]
        if not isinstance(q_sents, list):
            raise ValueError(
                "Type %s not accepted. Valid formats: string or list" %
                type(req_json['sentences']))

        if len(q_sents) == 0:
            label, conf, ids = [], [], []
        else:
            label, conf = worthinesspred.cw_pred_batched(tokmodmeta, q_sents)
            logger.debug('predicted %s labels and %s confidences' %
                         (len(label), len(conf)))
            # fixed: the loop variable used to shadow the `ids` result name
            ids = [hashu.calc_str_hash(sent) for sent in q_sents]

        return jsonify({
            'worthiness_checked_sentences': {
                'sentences': q_sents,
                'predicted_labels': label,
                'prediction_confidences': conf,
                'sentence_ids': ids,
            },
            'meta': {
                'model_info': tokmodmeta['model_info'],
                'timings': citimings.timing('predict_worthiness', start),
            }
        })

    except werkzeug.exceptions.BadRequest as e:
        logger.exception(e)
        return 'bad request! ' + str(e), 400
    except Exception as e:
        logger.exception(e)
        resp = jsonify({"error": str(e)})
        resp.status_code = 500
        return resp
コード例 #20
0
def search_claim_bots():
    """Returns a map describing the bots involved in `search_claim`

    :returns: a map describing the bots involved in `search_claim`
    :rtype: dict
    """
    t0 = citimings.start()
    bots = {
        # simReviewer includes the sentence encoder bot!
        'simReviewer': simReviewer(),
        'stancePred': stancePredictor()}
    timing = citimings.timing('search_claim_bots', t0)
    return {
        'results': [],  # no similar sentence results
        'bots': bots,
        'resultsHeader': {
            'QTime': timing['total_ms'],
            'timings': timing,
            'params': {}}}
コード例 #21
0
def analyze_doc(doc, cfg):
    """Semantically analyse a partial `doc`, producing a document
    similar to those in AW Solr.

    :param doc: dict that must contain at least fields `content` and
      `id` (or at least a `url` from which content can be scraped);
      optional but recommended fields: `title` and various metadata
      fields about where the document comes from and how it was
      processed up to this point.

    :param cfg: any configuration to influence how we analyze the doc.
      In particular, this should tell us about AW services we can
      reuse to perform the analysis such as an available AW
      semantic-api endpoint. We assume that this endpoint will be
      suitable for the language of the content.

    :returns: an analyzed doc that aims to be compatible with the
      standard AW Solr schema.  In particular, the output doc should
      combine the fields in the input doc with fields from semantic
      analysis such as categorization fields `taxonomy_x_tax`, entity
      fields `y_ss`, fact fields `fact_*_tax` However, **if you want
      full compatibility, you should perform a final check** based on
      the Solr schema in order to avoid adding fields by mistake.

    :rtype: dict
    """
    assert type(doc) is dict, str(type(doc))
    if 'content' not in doc:
        # no content yet: resolve and scrape the url to obtain it
        assert 'url' in doc, 'Expecting at least a url to resolve doc'
        doc = {**doc, **url_scraper.scrape(doc['url'])}

    t0 = citimings.start()
    doc = try_translate(doc, cfg)

    analyze = get_analyzer_fn(cfg)
    analysis = analyze(doc['content'], doc['title'], cfg)
    result = merge_semantic_analysis(doc, analysis)
    elaboration_t = citimings.timing('elaboration', t0)
    result['elaboration_elapsedtime'] = int(elaboration_t['total_ms'])
    if cfg.get('expand_claims', False):
        # imported lazily so the expander is only needed when enabled
        import semantic_analyzer.claim_content_expander as cce
        result['claims_content'] = cce.calc_claim_content(result, cfg)
    return result
コード例 #22
0
def add_stance_detection(claim_sim_results, sim_threshold=0.7):
    """Adds `doc_content` and `*_stance` fields to the input sim_results

    :param claim_sim_results: list of ClaimSimilarityResults
    :param sim_threshold: only perform stance detection for match results
      that are more similar than this value. Useful since stance detection is
      fairly slow.
    :returns: a modified `claim_sim_results` and timings
    :rtype: tuple
    """
    # fixed: removed an unused outer timer and the redundant inner
    # timer/sub-list; a single timer now wraps the labelling step
    start = citimings.start()
    result, stance_timing = do_add_stance_labels(
        claim_sim_results, sim_threshold=sim_threshold)
    stance_pred_t = citimings.timing('stance_pred', start, [stance_timing])
    return result, stance_pred_t
コード例 #23
0
ファイル: predictor.py プロジェクト: expertailab/acred
def dummyPrediction(tweet):
    """Return a random `TweetCredibilityAssessment` for `tweet`.

    No actual analysis is performed; the credibility value is random
    and the confidence is zero. Useful as a placeholder/baseline.
    """
    t0 = citimings.start()
    assessor = {'@context': ci_context,
                'name': 'dummyCredibilityPredictor'}
    return {
        '@context': ci_context,
        '@type': 'TweetCredibilityAssessment',
        'tweet_id': int(tweet['tweet_id']),
        'item_assessed': tweet,
        'credibility': random.random(),
        'confidence': 0.0,
        'explanation': 'Dummy prediction, no actual analysis performed.',
        # deprecated fields ('sentences_in_tweets', 'sentences_linked')
        # are now represented as sub_assessments
        'sub_assessments': [],
        'date_assessed': isodate.now_utc_timestamp(),
        'assessor': assessor,
        'timings': citimings.timing('dummyPrediction', t0)
    }
コード例 #24
0
def q_resp_to_related_sent(q_resp, claimid2pred):
    """Convert a claim DB response into sorted `SimilarSent` dicts.

    :param q_resp: solr-style response with `response.docs`
    :param claimid2pred: map from claim id to predicted similarity
    :returns: tuple (SimilarSent dicts sorted by descending similarity,
      timing object)
    :rtype: tuple
    """
    t0 = citimings.start()
    all_docs = q_resp['response']['docs']

    # we are only interested in documents that appear in claimid2pred,
    # otherwise these may be results for a different q_claim
    matching = [d for d in all_docs if d['id'] in claimid2pred]
    logger.info("Found %d (of %d) claims" % (
        len(matching), len(claimid2pred)))
    if len(claimid2pred) != len(matching):
        logger.warn("Expecting %d docs, but found %d.\n%s" % (
            len(claimid2pred), len(matching), str(claimid2pred)))
    pairs = [as_related_sent_or_claimReview(d, claimid2pred)
             for d in matching]
    sub_ts = [p[1] for p in pairs]
    relsents = sorted((p[0] for p in pairs),
                      key=lambda d: d['similarity'], reverse=True)
    return relsents, citimings.timing('doc_as_relsent', t0, sub_ts)
コード例 #25
0
ファイル: tweetrelsents.py プロジェクト: expertailab/acred
def build_in_linked_info(tweetID, urls, cfg):
    """Extract claims from the documents linked by a tweet.

    :param tweetID: id of the linking tweet
    :param urls: list of urls linked from the tweet
    :param cfg: config options
    :returns: list of claim dicts, each annotated with its source url,
      extraction timings and the linking tweet id
    :rtype: list
    """
    # TODO: refactor
    # why extract claims here? let predictor assess credibility of doc
    # so no need to extract sentences here
    in_linked_doc = []
    for url in urls:
        start = citimings.start()
        try:
            adoc = analyzed_doc(url, cfg)
            adoc_t = adoc['timings']
            claims_in_doc = list(gen_claims_from_analysed_doc(adoc, cfg))
            logger.info('Extracted %d claims from url' % len(claims_in_doc))
            claims_in_doc = [
                {**claim,
                 'url_in_tweet': url,
                 'timings': citimings.timing(
                     'url_in_tweet_claim_extraction', start, [adoc_t]),
                 'linked_by_tweet': tweetID}
                for claim in claims_in_doc]
            in_linked_doc.extend(claims_in_doc)
        except Exception as e:
            # fixed: errors were reported via bare print(); log instead
            # and re-raise with the original traceback (bare `raise`,
            # not `raise e`)
            logger.error("Unresolved url: %s\n%s" % (url, str(e)))
            raise
    return in_linked_doc
コード例 #26
0
ファイル: test_citimings.py プロジェクト: expertailab/acred
def f():
    """Leaf test operation: sleeps ~50ms and returns a dict with timings."""
    t0 = citimings.start()
    sleep(0.05)
    payload = {'a': 'b', 'c': 'd'}
    payload['timings'] = citimings.timing('f', t0)
    return payload
コード例 #27
0
ファイル: test_citimings.py プロジェクト: expertailab/acred
def g():
    """Leaf test operation: sleeps ~100ms and returns a dict with timings."""
    t0 = citimings.start()
    sleep(0.1)
    payload = {'e': 'f', 'g': 'h'}
    payload['timings'] = citimings.timing('g', t0)
    return payload
コード例 #28
0
def do_add_stance_labels(claim_sim_results, sim_threshold=0.7, max_len=128):
    """Predict stance labels for sufficiently similar matched sentences.

    Mutates the `SimilarSent` dicts inside `claim_sim_results` in place,
    adding `sent_stance` and `sent_stance_confidence` fields, and tags
    each result with the `stanceReviewer` returned by the predictor.

    :param claim_sim_results: list of claim similarity results, each
      with a `q_claim` string and a `results` list of `SimilarSent` dicts
    :param sim_threshold: only matches with `similarity` >= this value
      are sent for stance prediction
    :param max_len: token budget; query claims longer than 2/3 of this
      are skipped entirely
    :returns: the (mutated) `claim_sim_results` and a timing object
    :rtype: tuple
    """
    start = citimings.start()

    def trim(s, max_len):
        # keep only the first max_len whitespace-separated tokens
        s_toks = s.split(' ')
        if len(s_toks) > max_len:
            return ' '.join(s_toks[:max_len])
        else:
            return s
    
    # collect one stance request per query claim, pairing the claim
    # with the bodies of its sufficiently similar matched sentences
    stance_reqs = []
    for cresult in claim_sim_results:
        q_claim = cresult['q_claim']
        q_claim_toks = q_claim.split(' ')
        if len(q_claim_toks) > (2*max_len/3):
            logger.warning('Skip stance_pred: q_claim is too large %d ' % (
                len(q_claim_toks)))
            continue
        bods, rs_targets = [], []
        for rs in cresult['results']:
            if rs['similarity'] < sim_threshold:
                continue
            # docbod = rs.get('doc_content', None)
            # if docbod is not None:
            #     bods.append(trim(docbod, max_len))
            #     rs_targets.append({"rs": rs,
            #                        'field': 'doc_stance'})
            sent = rs.get('sentence', None)
            if sent is not None:
                bods.append(sent)
                rs_targets.append({'rs': rs,
                                   'field': 'sent_stance'})
        if len(bods) > 0:
            stance_reqs.append({
                'qclaim': q_claim,
                'doc_bodies': bods,
                'rs_targets': rs_targets})

    if len(stance_reqs) == 0:
        # nothing qualified for stance prediction: return inputs unchanged
        return claim_sim_results, citimings.timing(
            'predict_stances', start)

    labels, confs, stanceRev = predict_stances(
        # don't send the rs_targets to server
        [dictu.select_keys(sr, ['qclaim', 'doc_bodies'])
         for sr in stance_reqs])

    for csr in claim_sim_results:
        csr['stanceReviewer'] = stanceRev
    stance_docs_t = citimings.timing('doc_stance_pred', start)
    logger.info("Predicted stances %s with scores %s" % (labels, confs))

    # write each predicted label/confidence back onto the SimilarSent
    # dict it was requested for; predictions come back flattened in
    # request order, so flatten rs_targets the same way
    rs_targets = [rs_target for req in stance_reqs
                  for rs_target in req['rs_targets']]
    assert len(rs_targets) == len(labels)
    assert len(confs) == len(labels)
    for rs_target, label, conf in zip(rs_targets, labels, confs):
        rs = rs_target['rs']
        field = rs_target['field']
        rs[field] = label
        rs['%s_confidence' % field] = conf


    return claim_sim_results, citimings.timing(
        'predict_stances', start,
        [stance_docs_t])