def author_name(claimReview, defValue="unknown author"):
    """Return a display name for the author of a ClaimReview.

    Falls back to the domain of the author's url when no explicit name
    is present, stripping a leading `www.` and a trailing `.com`.

    :param claimReview: a ClaimReview dict (schema.org style)
    :param defValue: value returned when no name can be derived
    :returns: the author name or `defValue`
    :rtype: str
    """
    name = dictu.get_in(claimReview, ['author', 'name'])
    if name is None:
        url = dictu.get_in(claimReview, ['author', 'url'])
        name = content.domain_from_url(url)
    # Guard: domain_from_url may yield None; the previous version crashed here.
    if name is not None:
        # FIX: use slicing rather than str.replace, which removed *every*
        # occurrence (e.g. 'thing.com.example.com' -> 'thing.example')
        if name.startswith('www.'):
            name = name[len('www.'):]
        if name.endswith('.com'):
            name = name[:-len('.com')]
    return name or defValue
def from_old_DomainCredibility(dom_cred, cfg):
    """Converts a `DomainCredibility` into a `WebSiteCredReview`

    :param dom_cred: a `DomainCredibility` dict
    :param cfg: configuration options
    :returns: a `WebSiteCredReview`
    :rtype: dict
    """
    site_url = dom_cred.get('itemReviewed', 'missing_website')  # str
    website = content.str_as_website(site_url)  # reconstruct WebSite
    rating_value = dictu.get_in(dom_cred, ['credibility', 'value'], 0.0)
    assessment_count = len(dom_cred['assessments'])
    explanation = 'based on %d review(s) by external rater(s)%s' % (
        assessment_count, example_raters_markdown(dom_cred))
    review_text = 'Site `%s` seems *%s* %s' % (
        website.get('name', '??'),
        credlabel.describe_credval(rating_value, None),
        explanation)
    return {
        '@context': 'http://coinform.eu',
        '@type': 'WebSiteCredReview',
        'additionalType': content.super_types('WebSiteCredReview'),
        'itemReviewed': website,
        'text': review_text,
        'author': misinfoMeSourceCredReviewer(),
        'reviewRating': {
            '@type': 'AggregateRating',
            'reviewAspect': 'credibility',
            'ratingValue': rating_value,
            'confidence': dictu.get_in(
                dom_cred, ['credibility', 'confidence'], 0.5),
            'ratingExplanation': explanation,
            'reviewCount': assessment_count,
            'ratingCount': assessment_count
        },
        'dateCreated': dom_cred.get('dateCreated',
                                    isodate.now_utc_timestamp()),
        'reviewAspect': 'credibility',
        'isBasedOn': [],  # TODO:
        'isBasedOn_assessments': dom_cred['assessments'],
        'timings': dom_cred.get('timings', {})
    }
def aggregate_sub_creds(sub_creds, scope_name, cfg):
    """Aggregates a list of credibility dicts into a single credibility dict.

    This is done by (i) filtering over a minimum confidence and (ii)
    selecting the least credible sub credibility.

    *deprecated* you should be moving towards using `aggregate_subReviews`
    which uses the schema.org compliant Reviews and Ratings.

    :param sub_creds: a list of dicts. Should have field `credibility`
      with a `value` and `confidence`
    :param scope_name: string to denote the scope where the sub_creds were
      taken from, e.g. `document`
    :param cfg: config options. Currently for the `cred_conf_threshold`.
    :returns: the aggregate credibility dict
    :rtype: dict
    """
    # simplest case: nothing to aggregate
    if sub_creds is None or len(sub_creds) == 0:
        return {
            'credibility': 0.0,
            'confidence': 0.0,
            'credibility_label': 'not verifiable',
            'explanation': "No textual content found"
        }
    # filter credibilities by confidence
    conf_threshold = float(cfg.get('cred_conf_threshold', 0.7))
    conf_subcreds = [
        sc for sc in sub_creds
        if dictu.get_in(sc, ['credibility', 'confidence'], 0.0) > conf_threshold
    ]
    # not enough confidence in sub creds
    if len(conf_subcreds) == 0:
        sub_str = '%d sentences in %s' % (len(sub_creds), scope_name)
        msg = 'Could not assess credibility of %s with %s' % (
            sub_str, 'sufficient confidence')
        return {
            'credibility': 0.0,
            'confidence': 0.0,
            'credibility_label': 'not verifiable',
            'explanation': msg
        }
    # select minimum credibility value (with sufficient confidence)
    # FIX: min() is O(n); the previous version copied and fully sorted the
    # list just to take its head (same result: first minimum wins)
    minval_sc = min(conf_subcreds,
                    key=lambda sc: sc['credibility']['value'])
    msc_cred = minval_sc['credibility']
    msg = 'Sentence in %s: %s' % (scope_name, msc_cred['explanation'])
    credval = msc_cred['value']
    return {
        'credibility': credval,
        'confidence': msc_cred['confidence'],
        'credibility_label': credlabel.describe_credval(credval,
                                                        cred_dict=None),
        'explanation': msg
    }
def sa_resp_to_aw_facts(sem_analysis):
    """Map the facts in a semantic-analysis response onto indexable fields.

    :param sem_analysis: dict possibly containing a `facts` list
    :returns: dict of accumulated `fact_*` field values
    :rtype: dict
    """
    result = {}
    for fact in sem_analysis.get('facts', []):
        tax = fact['type'].lower()
        codepath = fact['factName']
        ent_type = dictu.get_in(fact, ['entity', 'type'])
        ent_value = dictu.get_in(fact, ['entity', 'value'])
        highlight = fact['hl']
        field = 'fact_%s_tax' % tax
        append_field_val(
            result, field,
            '%s/%s/%s' % (codepath, norm_title(ent_type), ent_value))
        append_field_val(result, '%s_hl' % field, codepath + highlight)
        append_field_val(result, 'facts_domain_%s_tax' % tax, codepath)
    return result
def predict_stances(qclaim_doc_bodies):
    """POST sentence pairs to the stance-prediction service.

    :param qclaim_doc_bodies: request payload for the `/predict_stance`
      endpoint
    :returns: tuple (labels, confidences, model_info) from the service reply
    """
    url = stance_pred_url + '/predict_stance'
    # NOTE(review): verify=False disables TLS certificate checking —
    # confirm this is intended for the deployment environment
    resp = requests.post(url, json=qclaim_doc_bodies, verify=False)
    logger.info("Response from %s %s" % (url, resp))
    body = resp.json()
    model_info = dictu.get_in(body, ['meta', 'model_info'])
    return body['labels'], body['confidences'], model_info
def websiteCredRev_as_qclaimCredRating(websiteCredRev, cfg):
    """Derive a claim-level credibility AggregateRating from a WebSiteCredReview.

    Fact-checker sites get their confidence penalised, since their claim
    reviews (not their domain reputation) are the interesting signal.

    :param websiteCredRev: a `WebSiteCredReview` dict
    :param cfg: configuration options
    :returns: an `AggregateRating` dict for the claim
    :rtype: dict
    """
    def from_rating(field, default):
        # convenience lookup into the website review's rating
        return dictu.get_in(websiteCredRev, ['reviewRating', field], default)

    base_rating = {
        '@type': 'AggregateRating',
        'reviewAspect': 'credibility',
        'reviewCount': from_rating('reviewCount', 0),
        'ratingCount': from_rating('ratingCount', 0),
        'ratingValue': from_rating('ratingValue', 0.0),
        'dateCreated': isodate.now_utc_timestamp()
    }
    site_name = dictu.get_in(websiteCredRev, ['itemReviewed', 'name'])
    site_text = websiteCredRev.get(
        'text', '(Explanation for website credibility missing)')
    if is_by_factchecker(websiteCredRev, cfg):
        # reduce domain credibility for fact-checkers, as we want to
        # focus on their claim reviews even if their confidence is
        # relatively low.
        # Refactoring of website_credrev.penalise_credibility
        penalty = float(cfg.get(
            'factchecker_website_to_qclaim_confidence_penalty_factor', 0.5))
        return {
            **base_rating,
            'confidence': from_rating('confidence', 0.0) * penalty,
            'ratingExplanation': "as it was published in site `%s`. %s %s" % (
                site_name, site_text,
                "However, the site is a factchecker so it publishes sentences with different credibility values.")
        }
    return {
        **base_rating,
        'confidence': from_rating('confidence', 0.0),
        'ratingExplanation': "as it was published on site `%s`. %s" % (
            site_name, site_text)
    }
def stancePredictor(cfg):
    """Return the stance-predictor bot info advertised by the claim-search service.

    :param cfg: config options; `dev_mock_stancePredictor` short-circuits
      the remote lookup (dev/test hook)
    :returns: the `bots.stancePred` entry of the service response
    """
    if 'dev_mock_stancePredictor' in cfg:
        return cfg['dev_mock_stancePredictor']
    search_url, auth, verify = read_claim_search_req_params(cfg)
    response = requests.post(search_url, json={}, verify=verify, auth=auth)
    response.raise_for_status()
    return dictu.get_in(response.json(), ['bots', 'stancePred'])
def semSentenceEncoder(cfg):
    """Return the sentence-encoder bot info advertised by the claim-search service.

    :param cfg: config options; `dev_mock_semSentenceEncoder` short-circuits
      the remote lookup (dev/test hook)
    :returns: the first element of `bots.simReviewer.isBasedOn` in the reply
    """
    if 'dev_mock_semSentenceEncoder' in cfg:
        return cfg['dev_mock_semSentenceEncoder']
    search_url, auth, verify = read_claim_search_req_params(cfg)
    response = requests.post(search_url, json={}, verify=verify, auth=auth)
    response.raise_for_status()
    encoders = dictu.get_in(response.json(),
                            ['bots', 'simReviewer', 'isBasedOn'])
    return encoders[0]
def partition_factual_sentences(items, cfg):
    """Process the incoming `items` and split it between worthy and unworthy

    :param items: list of items
    :type items: list of dicts
    :return: a list of factual statements and a list of non-factual
    :rtype: lists
    """
    worth_item_revs = rev_item_worthiness(items, cfg)
    logger.info("Reviewed sentence worthiness")
    # NOTE: the default 'worthy' means items with a *missing* rating value
    # are treated as factual (benefit of the doubt)
    factual_items = [
        it for it in worth_item_revs if dictu.get_in(
            it, ['worthinessReview', 'reviewRating', 'ratingValue'],
            'worthy') == 'worthy'
    ]
    # no default here: a missing rating value is not 'unworthy', which keeps
    # the two partitions disjoint and exhaustive
    nfs_items = [
        it for it in worth_item_revs if dictu.get_in(
            it, ['worthinessReview', 'reviewRating', 'ratingValue'])
        == 'unworthy'
    ]
    # sanity check: every item must land in exactly one of the partitions
    # (fails if a rating value other than worthy/unworthy appears)
    assert len(items) == len(factual_items) + len(nfs_items), '%s' % (
        'The total number of factual and non factual items '
        'must be the same as the initial number of items sent to the process')
    return factual_items, nfs_items
def route_template(item_or_typename):
    """Returns the route template for the item

    :param item_or_typename: either an item (dict with `@type` field) or
      a typename (str)
    :returns: a "new style" python string template
    :rtype: str
    :raises ValueError: if the type name is not registered in the schema
    """
    if is_item(item_or_typename):
        return route_template(item_or_typename['@type'])
    typename = item_or_typename
    # FIX: isinstance instead of `type(x) is str` (also accepts str subclasses)
    assert isinstance(typename, str), 'Not a type name: %s %s' % (
        type(typename), typename)
    if typename in _acred_schema:
        return dictu.get_in(_acred_schema, [typename, 'route_template'])
    else:
        raise ValueError('Type name %s has not been registered' % typename)
def test_similarSent_as_SentStanceReview_05():
    """Check similarSent_as_SentStanceReview produces a well-formed review."""
    review = sscr.similarSent_as_SentStanceReview(relSent05, mockSimResult, {})
    assert review is not None
    # fields the SentStanceReview schema is expected to contain, no more/less
    expectedFields = [
        '@context', '@type', 'additionalType', 'reviewAspect', 'itemReviewed',
        'reviewRating', 'dateCreated', 'author'
    ]
    # project-level validity check (JSON-serialisable value)
    isval, msg = dictu.is_value(review)
    assert isval, msg
    # debugging aid: uncomment to regenerate the expected fixture
    # with open('test/SentStanceReview/ssr05.json', 'w') as f:
    #     json.dump(review, f, indent=2)
    assert set(expectedFields) == set(list(review.keys()))
    assert 'stance' == review['reviewAspect']
    assert content.is_sentence_pair(review['itemReviewed'])
    assert dictu.get_in(review, ['author', '@type']) == 'SentStanceReviewer'
def super_types(item_or_typename):
    """Returns a list of super type names for an item or typename

    :param item_or_typename: either an item (dict with `@type` field) or
      a typename (str)
    :returns: a list of type names (empty, with a warning, for unknown types)
    :rtype: list
    """
    if is_item(item_or_typename):
        return super_types(item_or_typename['@type'])
    typename = item_or_typename
    # FIX: isinstance instead of `type(x) is str` (also accepts str subclasses)
    assert isinstance(typename, str), 'Not a type name: %s %s' % (
        type(typename), typename)
    if typename in _acred_schema:
        return dictu.get_in(_acred_schema, [typename, 'super_types'])
    else:
        # unlike the sibling lookups, unknown types are tolerated here
        logger.warning('Type name %s has not been registered' % typename)
        return []
def ident_keys(item_or_typename):
    """Returns a list of ident keys for item or typename

    :param item_or_typename: either an item (dict with `@type` field) or
      a typename (str)
    :returns: a list of keys whose values uniquely identify the given item
      or typename
    :rtype: list
    :raises ValueError: if the type name is not registered in the schema
    """
    if is_item(item_or_typename):
        return ident_keys(item_or_typename['@type'])
    typename = item_or_typename
    # FIX: isinstance instead of `type(x) is str` (also accepts str subclasses)
    assert isinstance(typename, str), 'Not a type name: %s %s' % (
        type(typename), typename)
    if typename in _acred_schema:
        return dictu.get_in(_acred_schema, [typename, 'ident_keys'])
    else:
        raise ValueError('Type name %s has not been registered' % typename)
def select_most_confident_review(reviews, cfg):
    """Return the review with the highest rating confidence.

    :param reviews: list of review dicts (each must satisfy
      `content.is_review`)
    :param cfg: config options (unused, kept for interface compatibility)
    :returns: the most confident review, or None for an empty list
    :rtype: dict or None
    """
    if len(reviews) == 0:
        return None
    for rev in reviews:
        assert content.is_review(rev), rev
    # Note: we could break ties deterministically by returning a tuple
    # from the key if there are multiple maxima.
    # FIX: max() is O(n); the previous version sorted the whole list just to
    # take the head. Both return the *first* maximum, so behavior is equal.
    return max(reviews,
               key=lambda rev: dictu.get_in(rev,
                                            ['reviewRating', 'confidence'],
                                            -1.0))
def itemref_keys(item_or_typename):
    """Returns a list of itemRef keys for item or typename

    An itemRef key is a key whose value is another (single or a list of)
    item. Therefore, the values can be represented either as expanded items,
    but also as references to those items, typically a string with the
    identifier (but also possibly a url).

    :param item_or_typename: either an item (dict with `@type` field) or
      a typename (str)
    :returns: a list of keys for the type which refer to other items
    :rtype: list
    :raises ValueError: if the type name is not registered in the schema
    """
    if is_item(item_or_typename):
        return itemref_keys(item_or_typename['@type'])
    typename = item_or_typename
    # FIX: isinstance instead of `type(x) is str` (also accepts str subclasses)
    assert isinstance(typename, str), 'Not a type name: %s %s' % (
        type(typename), typename)
    if typename in _acred_schema:
        return dictu.get_in(_acred_schema, [typename, 'itemref_keys'])
    else:
        raise ValueError('Type name %s has not been registered' % typename)
def sa_resp_to_aw_relations(sem_analysis):
    """Map the relations in a semantic-analysis response onto indexable fields.

    :param sem_analysis: dict possibly containing a `relations` list
    :returns: dict with a serialised `relations` field plus accumulated
      entity/action fields; empty dict if there are no relations
    :rtype: dict
    """
    normalised = [normalise_relation(r)
                  for r in sem_analysis.get('relations', [])]
    if not normalised:
        return {}
    result = {'relations': json.dumps(normalised)}
    for rel in normalised:
        source_pair = '%s/%s' % (dictu.get_in(rel, ['source', 'type']),
                                 dictu.get_in(rel, ['source', 'value']))
        dest_pair = '%s/%s' % (dictu.get_in(rel, ['destination', 'type']),
                               dictu.get_in(rel, ['destination', 'value']))
        action_pair = '%s/%s' % (dictu.get_in(rel, ['action', 'classification']),
                                 dictu.get_in(rel, ['action', 'value']))
        append_field_val(result, 'relations_entities', source_pair)
        append_field_val(result, 'relations_entities', dest_pair)
        append_field_val(result, 'relations_actions', action_pair)
    return result
def backward_compatible_tweetcred_predictions(preds):
    """Ensure that each prediction contains fields needed for backward
    compatibility

    These are fields which are used by the co-inform rule-engine:
    `tweet_id`, `credibility`, `confidence` and `explanation`. These should
    already be there if the requested acred reviewFormat was
    `cred_assessment`, but should be missing if the it was `schema.org`
    (the new, recommended output).

    :param preds: a list of (or an individual) prediction dicts
    :returns: the same list of predictions but with any missing fields for
      backward compatibility (dicts are mutated in place)
    :rtype: list or dict
    """
    if type(preds) is list:
        return [backward_compatible_tweetcred_predictions(p) for p in preds]
    assert type(preds) is dict
    pred = preds  # single prediction, mutated in place

    def fill(key, compute):
        # only compute and assign when the legacy field is absent
        if key not in pred:
            pred[key] = compute()

    # assume schema.org format when the legacy fields are missing
    fill('tweet_id',
         lambda: dictu.get_in(pred, ['itemReviewed', 'tweet_id']))
    fill('credibility',
         lambda: dictu.get_in(pred, ['reviewRating', 'ratingValue']))
    fill('confidence',
         lambda: dictu.get_in(pred, ['reviewRating', 'confidence'], 0.0))
    fill('explanation',
         lambda: dictu.get_in(pred, ['reviewRating', 'ratingExplanation']))
    fill('ratingExplanation',
         lambda: dictu.get_in(
             pred, ['text'],
             dictu.get_in(pred, ['reviewRating', 'ratingExplanation'])))
    fill('ratingExplanationFormat', lambda: 'markdown')
    return pred
def aggregate_subReviews(simple_sentSimReview, stanceReview, cfg):
    """Aggregates a similarity and stance review into a polar similarity review

    :param simple_sentSimReview: a (non-polar) `SentSimilarityReview` for a
      `sentPair`
    :param stanceReview: a `SentStanceReview` for the same `sentPair` as
      `simple_sentSimReview`
    :param cfg: configuration options
    :returns: a `SentPolarSimilarityReview` (or the plain similarity review
      when no stance review is available)
    :rtype: dict
    """
    assert simple_sentSimReview is not None
    if stanceReview is None:
        return simple_sentSimReview
    sim = dictu.get_in(simple_sentSimReview, ['reviewRating', 'ratingValue'])
    sent_stance = dictu.get_in(stanceReview, ['reviewRating', 'ratingValue'],
                               'unrelated')
    # FIX: default must be the float 0.5 (was the string '0.5'), since the
    # value is used as the aggregate rating's `confidence` and is passed to
    # calc_agg_polarsim
    stance_conf = dictu.get_in(stanceReview, ['reviewRating', 'confidence'],
                               0.5)
    sent_pair = simple_sentSimReview['itemReviewed']
    assert stanceReview['itemReviewed'] == sent_pair, '%s != %s' % (
        stanceReview['itemReviewed'], sent_pair)
    agg_sim = calc_agg_polarsim(sim=sim, sent_stance=sent_stance,
                                sent_stance_conf=stance_conf, cfg=cfg)
    sub_reviews = [
        sr for sr in [simple_sentSimReview, stanceReview] if sr is not None
    ]
    sub_ratings = [
        srev.get('reviewRating') for srev in sub_reviews
        if srev.get('reviewRating') is not None
    ]
    headline = simlabel.claim_rel_str(sim, sent_stance)
    # TODO: more than an explanation this is the review body
    # the explanation would be that one model said the sentences were x
    # similar while another said they were (stance)
    explanation = 'Sentence `%s` %s `%s`' % (
        dictu.get_in(sent_pair, ['sentA', 'text']), headline,
        dictu.get_in(sent_pair, ['sentB', 'text']))
    sub_bots = [
        simple_sentSimReview.get('author', {}),
        stanceReview.get('author', {})
    ]
    return {
        '@context': 'http://coinform.eu',
        '@type': 'SentPolarSimilarityReview',
        'additionalType': content.super_types('SentPolarSimilarityReview'),
        'itemReviewed': sent_pair,
        'headline': headline,
        'reviewAspect': 'polarSimilarity',
        'reviewBody': explanation,
        'reviewRating': {
            '@type': 'AggregateRating',
            'reviewAspect': 'polarSimilarity',
            'ratingValue': agg_sim,
            'confidence': stance_conf,
            'reviewCount': len(sub_reviews),
            'ratingCount': agg.total_ratingCount(sub_ratings),
            'ratingExplanation': explanation
        },
        'isBasedOn': sub_reviews,
        'dateCreated': isodate.now_utc_timestamp(),
        'author': bot_info(sub_bots, cfg)
    }
def aggregate_subReviews(db_Sentence, claimReview, webSiteCred, cfg):
    """Aggregates (claim and WebSite) reviews about a DB Sentence into a
    credibility review

    :param db_Sentence: a `Sentence` in the Co-inform database
    :param claimReview: a `ClaimReview` for the db_Sentence. May be None if
      no claim review is available for the sentence. In general, the claim
      review will not have been normalised (i.e. mapped onto the co-inform
      accuracy/credibility scales)
    :param webSiteCred: a `WebSiteCredReview` for a webSite where the
      `db_Sentence` was published.
    :param cfg: configuration options
    :returns: a `DBSentCredReview`
    :rtype: dict
    """
    nClaimReview = crn.normalise(claimReview, cfg)
    if nClaimReview is None:
        nClaimReview = {}
    nWebSiteRating = None
    if webSiteCred:
        nWebSiteRating = websiteCredRev_as_qclaimCredRating(webSiteCred, cfg)
        # FIX: isinstance instead of `type(x) == float` comparisons
        assert isinstance(nWebSiteRating['confidence'], float)
    assert isinstance(
        dictu.get_in(nClaimReview, ['reviewRating', 'confidence'], 0.0),
        float)
    subRatings = [nWebSiteRating, nClaimReview.get('reviewRating', None)]
    subRatings = [r for r in subRatings if r is not None]
    # fall back to a zero rating when neither source produced one
    sel_rating = agg.select_most_confident_rating(subRatings) or {
        'ratingValue': 0.0,
        'confidence': 0.0,
        'ratingExplanation':
        'No website or claimReview associated with this sentence'
    }
    isBasedOn = [webSiteCred, nClaimReview]
    isBasedOn = [ibo for ibo in isBasedOn if ibo is not None and ibo != {}]
    reviewCount = agg.total_reviewCount(subRatings) + len(isBasedOn)
    ratingCount = agg.total_ratingCount(subRatings)
    # should be a superset of [ibo.get('author') for ibo in isBasedOn]
    sub_bots = default_sub_bots(cfg)
    appears_in_docs = db_Sentence.get('appearance', [])
    appears_in_doc = appears_in_docs[0] if appears_in_docs else None
    link_to_doc = md_link_to_doc(appears_in_doc)
    revRating = {
        '@type': 'AggregateRating',
        'reviewAspect': 'credibility',
        'reviewCount': reviewCount,
        'ratingCount': ratingCount,
        'ratingValue': sel_rating.get('ratingValue', 0.0),
        'confidence': sel_rating.get('confidence', 0.0),
        'ratingExplanation': sel_rating.get('ratingExplanation')
    }
    return {
        '@context': "http://coinform.eu",
        '@type': "DBSentCredReview",
        'additionalType': content.super_types('DBSentCredReview'),
        'itemReviewed': db_Sentence,
        'text': 'Sentence `%s` %sseems *%s* %s' % (
            db_Sentence.get('text', '??'),
            ', in %s, ' % (link_to_doc) if link_to_doc else '',
            credlabel.rating_label(revRating, cfg),
            sel_rating.get('ratingExplanation')
        ),
        'reviewRating': revRating,
        'reviewAspect': 'credibility',
        'isBasedOn': isBasedOn,
        'dateCreated': isodate.now_utc_timestamp(),
        'author': bot_info(sub_bots, cfg)
    }
def aggregate_sentReviews(sentReviews, adoc, cfg):
    """Combines CredReviews for sentences in adoc into an ArticleCredReview

    Refactoring of `aggregate_sub_creds`

    :param sentReviews: list of sentence CredibilityReviews. In practice, we
      expect a list of `AggQSentCredReview`s.
    :param adoc: an analysed document. The item to be reviewed.
    :param cfg: config options. Currently for the `cred_conf_threshold`
    :returns: an `ArticleCredReview` aggregating the credibility reviews of
      sentences in the article.
    :rtype: dict
    """
    doc_mdref = markdown_ref_for_article(adoc, cfg)
    sub_bots = [
    ]  # extract sub_bot from sentReviews and make sure they match default sub_bots?
    author = default_bot_info(cfg)
    partial_ArticleCredRev = {
        **base_ArticleCredReview(cfg),
        'author': author,
        'itemReviewed': adoc,
        'isBasedOn': sentReviews
    }
    # simplest case: no sentence reviews at all
    if sentReviews is None or len(sentReviews) == 0:
        explanation = 'we could not find any relevant claims in it.'
        return {
            **partial_ArticleCredRev,
            'text': '%s is *not verifiable* as %s' % (doc_mdref, explanation),
            'reviewRating': {
                '@type': 'Rating',
                'reviewAspect': 'credibility',
                'ratingValue': 0.0,
                'confidence': 0.0,
                'ratingExplanation': explanation
            }
        }
    subRatings = [
        sr.get('reviewRating') for sr in sentReviews
        if sr.get('reviewRating') is not None
    ]
    for sr in subRatings:  # really, just validating
        assert 'ratingValue' in sr, '%s' % (sr)
        assert sr['ratingValue'] is not None, '%s' % (sr)
        assert 'confidence' in sr, '%s' % (sr)
        assert sr['confidence'] is not None, '%s' % (sr)
    # filter by confidence
    conf_threshold = float(cfg.get('cred_conf_threshold', 0.7))
    filter_fn = agg.filter_review_by_min_confidence(conf_threshold)
    conf_subRevs = [sr for sr in sentReviews if filter_fn(sr)]
    igno_subRevs = [sr for sr in sentReviews if not filter_fn(sr)]
    # not enough confidence in sentReviews
    if len(conf_subRevs) == 0:
        msg = 'we could not assess credibility of %d of its sentences with %s.%s' % (
            len(sentReviews), 'sufficient confidence',
            (' An example: %s ' % igno_subRevs[0]['text'])
            if len(igno_subRevs) > 0 else '')
        return {
            **partial_ArticleCredRev,
            'text': '%s is *not verifiable* as %s.' % (doc_mdref, msg),
            'reviewRating': {
                '@type': 'AggregateRating',
                'reviewAspect': 'credibility',
                'ratingValue': 0.0,
                'confidence': 0.0,
                'ratingExplanation': msg,
                'ratingCount': agg.total_ratingCount(subRatings),
                'reviewCount':
                agg.total_reviewCount(subRatings) + len(sentReviews)
            }
        }
    # select least credible above the confidence threshold
    # FIX: no need to copy the list before sorting; sorted() already
    # returns a new list
    subRevs_by_val = sorted(conf_subRevs,
                            key=lambda rev: dictu.get_in(
                                rev, ['reviewRating', 'ratingValue'], 0.0))
    least_cred_rev = subRevs_by_val[0]
    msg = 'like its least credible Sentence `%s` which %s' % (
        dictu.get_in(least_cred_rev, ['itemReviewed', 'text'],
                     '(missing sentence)'),
        dictu.get_in(least_cred_rev, ['reviewRating', 'ratingExplanation'],
                     '(missing explanation)'))
    revRating = {
        '@type': 'AggregateRating',
        'reviewAspect': 'credibility',
        'ratingValue': dictu.get_in(least_cred_rev,
                                    ['reviewRating', 'ratingValue'], 0.0),
        'confidence': dictu.get_in(least_cred_rev,
                                   ['reviewRating', 'confidence'], 0.0),
        'ratingExplanation': msg,
        'ratingCount': agg.total_ratingCount(subRatings),
        'reviewCount': agg.total_reviewCount(subRatings) + len(sentReviews)
    }
    return {
        **partial_ArticleCredRev,
        'isBasedOn': subRevs_by_val + igno_subRevs,
        'text': '%s is *%s* %s' % (doc_mdref,
                                   credlabel.rating_label(revRating, cfg),
                                   msg),
        'reviewRating': revRating
    }
def aggregate_subReviews(domcredReview, content_credReview, adoc, cfg):
    """Combines the domain and content credibility reviews for adoc into an
    AggregateRating.

    Refactoring of `aggregate_article_cred`

    :param domcredReview: a `WebsiteCredReview` for the domain/url of adoc
    :param content_credReview: a content credibility review for adoc
    :param adoc: the article being rated, useful for generating explanations
    :param cfg: config options
    :returns: an `AggregateRating`
    :rtype: dict
    """
    doc_mdref = markdown_ref_for_article(adoc, cfg)
    thresh = cfg.get('cred_conf_threshold', 0.7)
    content_conf = dictu.get_in(content_credReview,
                                ['reviewRating', 'confidence'], 0.0)
    domcred_conf = dictu.get_in(domcredReview,
                                ['reviewRating', 'confidence'], 0.0)
    if content_conf >= thresh:
        # content analysis is confident: use it, possibly adding site context
        credval = dictu.get_in(content_credReview,
                               ['reviewRating', 'ratingValue'], 0.0)
        cred_conf = content_conf
        explanation = dictu.get_in(content_credReview,
                                   ['reviewRating', 'ratingExplanation'], '')
        if domcred_conf >= thresh:
            explanation += '\nTake into account that it appeared in website `%s`. %s' % (
                dictu.get_in(
                    domcredReview, ['itemReviewed', 'name'],
                    dictu.get_in(domcredReview, ['itemReviewed', 'url'],
                                 '(missing)')),
                domcredReview.get(
                    'text', '(Explanation for site credibility missing)'))
    elif domcred_conf >= thresh:
        # only the website signal is confident
        credval = dictu.get_in(domcredReview,
                               ['reviewRating', 'ratingValue'], 0.0)
        penalty_factor = float(cfg.get('article_from_website_conf_factor',
                                       0.9))
        webcred_thresh = float(
            cfg.get('article_from_website_cred_threshold_penalise', 0.2))
        # penalise confidence if above a threshold
        # credible website can still publish false claims
        # but all claims in non-credible website should be questioned
        cred_conf = (domcred_conf * penalty_factor
                     if credval >= webcred_thresh else domcred_conf)
        explanation = "as it appeared in website `%s`. %s" % (
            dictu.get_in(
                domcredReview, ['itemReviewed', 'name'],
                dictu.get_in(domcredReview, ['itemReviewed', 'url'],
                             '(missing)')),
            domcredReview.get('text',
                              '(Explanation for site credibility missing)'))
    else:
        # neither signal is confident enough
        credval = 0.0
        cred_conf = 0.0
        explanation = 'we have insufficient credibility signals from text and website analyses.'
        contentExpl = dictu.get_in(content_credReview, ['text'])
        websiteExpl = dictu.get_in(domcredReview, ['text'])
        if contentExpl or websiteExpl:
            # FIX: leading space was missing, producing "analyses.In case"
            explanation += ' In case it is useful, we include the **weak** credibility signals we found:%s%s' % (
                '\n * %s' % contentExpl if contentExpl else '',
                '\n * %s' % websiteExpl if websiteExpl else '')
    # FIX: guard against a missing review; a None entry would raise TypeError
    subRatings = [
        r['reviewRating'] for r in [domcredReview, content_credReview]
        if r is not None and 'reviewRating' in r
    ]
    return {
        '@type': 'AggregateRating',
        'reviewAspect': 'credibility',
        'ratingValue': credval,
        'confidence': cred_conf,
        'ratingExplanation': explanation,
        'ratingCount': agg.total_ratingCount(subRatings),
        'reviewCount': agg.total_reviewCount(subRatings) + 2
    }
# Script section: query the acredapi credibility endpoint for each claim in
# each file's DataFrame and collect CLEF-format predictions.
# NOTE(review): relies on names defined elsewhere in the script (f2df,
# acredapi_url, cred_path, cred_thresh, args, acred_as_clef_label, ...) and
# appears to continue past this excerpt (fname is assigned but not yet used).
for f, df in f2df.items():
    clef_pred = []
    handled_ids = []  # claim ids already queried, to skip duplicates
    claims = df.to_dict(orient='records')
    for ci, claim in enumerate(claims):
        logger.info('Claim %d of %d in %s' % (ci, len(claims), f))
        cid = int(claim['claim_number'])
        if cid in handled_ids:
            logger.info('Skipping as previously handled')
            continue
        url = '%s/api/v1/claim/predict/credibility?claim=%s' % (
            acredapi_url, claim['normalized_claim'])
        # NOTE(review): verify=False disables TLS certificate checking
        resp = requests.get(url, verify=False)
        resp.raise_for_status()
        claimcreds = resp.json()
        # take the top credibility review and map it onto a CLEF label
        credRating = dictu.get_in(claimcreds[0], cred_path)
        clef_pred.append({
            'id': cid,
            'label': acred_as_clef_label(credRating, cred_thresh)
        })
        handled_ids.append(cid)
        out_dir = '%s/reviews' % (args.outFolder)
        if not os.path.exists(out_dir):
            print('Creating dir %s for the reviews' % (out_dir))
            os.makedirs(out_dir)
        # write CredibilityReview to outFolder
        fname = f.replace('.txt', '_%s.json' % cid)
def aggregate_subReviews(subReviews, tweet, cfg):
    """Creates an aggregate review based on subReviews for tweet

    Refactoring of `aggregate_tweet_cred`

    :param subReviews: list of credibility reviews for (parts of) the tweet
      to review.
    :param tweet: the tweet being reviewed
    :param cfg: config options
    :returns: a credibility review for the `tweet` to review that contains
      an `AggregateRating` based on the `subReviews`
    :rtype: dict
    """
    # extract sub_bots and compare to default_sub_bots
    partial_TweetCredReview = {
        '@context': ci_context,
        '@type': 'TweetCredReview',
        'itemReviewed': tweet,
        'isBasedOn': subReviews,
        'dateCreated': isodate.now_utc_timestamp(),
        'author': default_bot_info(cfg)
    }
    tweet_mdref = markdown_ref_for_tweet(tweet, cfg)
    if subReviews is None:
        subReviews = []
    subRatings = [
        sr.get('reviewRating') for sr in subReviews
        if sr.get('reviewRating') is not None
    ]
    # filter by min confidence
    conf_threshold = float(cfg.get('cred_conf_threshold', 0.7))
    filter_fn = agg.filter_review_by_min_confidence(conf_threshold)
    conf_subRevs = [sr for sr in subReviews if filter_fn(sr)]
    igno_subRevs = [sr for sr in subReviews if not filter_fn(sr)]
    # no (confident) subReviews
    if len(conf_subRevs) == 0:
        part_rating = {
            '@type': 'Rating',
            'ratingValue': 0.0,
            'confidence': 0.0,
            'reviewAspect': 'credibility'
        }
        if len(subReviews) == 0:
            # FIX: the format string had no placeholder, so applying
            # `% (tweet_mdref)` raised TypeError at runtime; the tweet ref
            # is already included via `text` below
            msg = "we could not extract (or assess credibility of) its sentences or linked documents"
            rating = {**part_rating, 'ratingExplanation': msg}
        else:
            msg = 'we could not assess the credibility of its %d sentences or linked documents.%s' % (
                len(subReviews),
                '\nFor example:\n * %s' % (igno_subRevs[0]['text']))
            rating = {
                **part_rating,
                '@type': 'AggregateRating',
                'ratingExplanation': msg,
                'ratingCount': agg.total_ratingCount(subRatings),
                'reviewCount':
                agg.total_reviewCount(subRatings) + len(subReviews)
            }
        return {
            **partial_TweetCredReview,
            'text': '%s seems *%s* as %s' % (
                tweet_mdref, credlabel.rating_label(rating, cfg), msg),
            'reviewRating': rating
        }
    # select least credible subReview
    subRevs_by_val = sorted(conf_subRevs,
                            key=lambda rev: dictu.get_in(
                                rev, ['reviewRating', 'ratingValue'], 0.0))
    least_cred_rev = subRevs_by_val[0]
    msg = 'based on its least credible part:\n%s' % (dictu.get_in(
        least_cred_rev, ['text'], '(missing explanation for part)'))
    revRating = {
        '@type': 'AggregateRating',
        'reviewAspect': 'credibility',
        'ratingValue': dictu.get_in(least_cred_rev,
                                    ['reviewRating', 'ratingValue'], 0.0),
        'confidence': dictu.get_in(least_cred_rev,
                                   ['reviewRating', 'confidence'], 0.0),
        'ratingExplanation': msg,
        'ratingCount': agg.total_ratingCount(subRatings),
        'reviewCount': agg.total_reviewCount(subRatings) + len(subReviews)
    }
    return {
        **partial_TweetCredReview,
        'isBasedOn': subRevs_by_val + igno_subRevs,  # just a re-ordering
        'text': '%s seems *%s* %s' % (
            tweet_mdref, credlabel.rating_label(revRating, cfg), msg),
        'reviewRating': revRating
    }
def claimsim_result_as_aggQSentCredReview(claimsim_result, worth_rev, cfg):
    """Convert a `SemanticClaimSimilarityResult` into a `AggQSentCredReview`

    This refactors `claimsim_result_as_claimcred`.

    :param claimsim_result: list of SimSent reviews
    :param worth_rev: dict with check worthiness review
    :param cfg: config options
    :returns: a `AggQSentCredReview`
    :rtype: dict
    """
    qsent = claimsim_result['q_claim']  # qsent
    relsents = claimsim_result['results']  # simsents
    itemReviewed = content.as_sentence(qsent, cfg=cfg)
    # no similar sentences found: emit a default "not verifiable" review
    if len(relsents) == 0:
        rating = default_rating()
        aggqsent = {
            **base_AggQSentCredReview(cfg),
            'itemReviewed': itemReviewed,
            'text': 'Sentence `%s` seems *not verifiable* as it %s' % (
                itemReviewed['text'], rating['ratingExplanation']),
            'reviewRating': {
                **rating,
                'identifier': itnorm.calc_identifier(rating, cfg)
            },
            'isBasedOn': [worth_rev] if worth_rev else []
        }
        result = {
            **aggqsent,
            # 'identifier': itnorm.calc_identifier(aggqsent, cfg),
        }
        return result
    # one credibility review per similar sentence
    qsent_credrevs = [
        qsent_credrev.similarSent_as_QSentCredReview(simSent, claimsim_result,
                                                     cfg)
        for simSent in relsents
    ]
    # TODO: remove subReviews if based on websiteCredRev for a factchecker (but not a claimReview)
    for qscr in qsent_credrevs:  # sanity-check the sub reviews
        assert qscr['itemReviewed'] == itemReviewed
        assert dictu.get_in(
            qscr, ['reviewRating', 'reviewAspect']) == 'credibility'
    # sub ratings: one per credibility review, plus the worthiness rating
    subRatings = [
        rev.get('reviewRating') for rev in qsent_credrevs
        if rev.get('reviewRating') is not None
    ] + ([worth_rev.get('reviewRating')]
         if worth_rev and worth_rev.get('reviewRating') is not None else [])
    # the aggregate adopts the value/confidence of the most confident review
    top_qscr = agg.select_most_confident_review(qsent_credrevs, cfg)
    top_rating = top_qscr.get('reviewRating', {})
    reviewRating = {
        '@type': 'AggregateRating',
        'reviewAspect': 'credibility',
        'ratingValue': top_rating.get('ratingValue', 0.0),
        'confidence': top_rating.get('confidence', 0.0),
        'ratingExplanation': top_rating.get('ratingExplanation', None),
        'ratingCount': agg.total_ratingCount(subRatings),
        'reviewCount': agg.total_reviewCount(subRatings) +
        len(qsent_credrevs) + len([worth_rev] if worth_rev else [])
    }
    result = {
        **base_AggQSentCredReview(cfg),
        'itemReviewed': itemReviewed,
        'text': 'Sentence `%s` seems *%s* as it %s' % (itemReviewed.get(
            'text', '??'), credlabel.rating_label(
                reviewRating, cfg), reviewRating['ratingExplanation']),
        'reviewRating': {
            **reviewRating,
            'identifier': itnorm.calc_identifier(reviewRating, cfg)
        },
        'isBasedOn': qsent_credrevs + ([worth_rev] if worth_rev else [])
    }
    return result
def author_url(claimReview, defValue="unknownUrl"):
    """Return the url of the ClaimReview's author, or `defValue` if absent."""
    result = dictu.get_in(claimReview, ['author', 'url'], defValue)
    return result
def filter_fn(review):
    """True iff the review's rating confidence meets `threshold`.

    `threshold` is a closure variable from the enclosing scope; a missing
    confidence counts as 0.0.
    """
    confidence = dictu.get_in(review, ['reviewRating', 'confidence'], 0.0)
    return confidence >= threshold