def search_claim(q_claim):
    """Finds similar claims or sentences in the co-inform claim database

    :param q_claim: an English sentence or claim, or a list of such
        sentences. A single string must not contain multiple sentences.
    :type q_claim: str or list
    :returns: map with `results` (one per input claim) and `resultsHeader`
    :rtype: dict
    :raises InvalidUsage: if `q_claim` is None or of an unsupported type
    """
    # Normalise the input into a list of claim strings. The previous
    # version left `q_claims` unbound for unsupported types (e.g. int),
    # which surfaced as a confusing NameError; reject those explicitly.
    if q_claim is None:
        raise InvalidUsage("Claim is mandatory")
    elif isinstance(q_claim, str):
        q_claims = [q_claim]
    elif isinstance(q_claim, list):
        q_claims = q_claim
    else:
        raise InvalidUsage("Claim must be a string or a list of strings")

    start = citimings.start()
    logger.info('Searching semantic vector space for %s claim(s)' % len(
        q_claims))
    topn = 5
    preds, claim_ids, simReviewer = search_semantic_vecspace(
        q_claims, topn=topn)
    search_semspace_t = citimings.timing('search_semantic_vecspace', start)
    assert len(preds) == len(claim_ids)
    assert len(q_claims) == len(preds)

    q_resp, claim_retrieve_t = retrieve_result_claims(
        claim_ids, q_claims, topn)

    # Build one SemanticClaimSimilarityResult per input claim
    start3 = citimings.start()
    results, sub_build_ts = [], []
    for i in range(len(q_claims)):
        start4 = citimings.start()
        claim_id2pred = {idx: float(pred)
                         for idx, pred in zip(claim_ids[i], preds[i])}
        relsents, sub_ts = q_resp_to_related_sent(q_resp, claim_id2pred)
        results.append({
            '@context': ci_context,
            '@type': 'SemanticClaimSimilarityResult',
            'dateCreated': isodate.now_utc_timestamp(),
            'q_claim': q_claims[i],
            'simReviewer': simReviewer,
            'results': relsents})
        sub_build_ts.append(citimings.timing('build_result', start4, sub_ts))
    result_build_t = citimings.timing('build_results', start3, sub_build_ts)

    # Stance detection is relatively slow, so it is thresholded by similarity
    results, stance_pred_t = add_stance_detection(
        results, sim_threshold=stance_min_sim_threshold)

    timing = citimings.timing(
        'search_claim', start,
        [search_semspace_t, claim_retrieve_t, result_build_t, stance_pred_t])
    return {
        'results': results,
        'resultsHeader': {
            'QTime': timing['total_ms'],
            'timings': timing,
            'params': {'claim': q_claim}}}
def analyzed_doc(url, cfg):
    """Scrape `url` and return a semantically analyzed document.

    A previously indexed analysis is reused when one exists for the
    resolved URL; otherwise the full analysis pipeline runs. The returned
    doc always carries a `timings` field covering the steps performed.

    :param url: url of the document to analyze
    :param cfg: configuration map; may define `relsents_in_colls`
    :returns: an analyzed document dict
    :rtype: dict
    """
    logger.info("Scraping " + url)
    t_total = citimings.start()
    scraped = url_scraper.scrape(url)
    scrape_timing = citimings.timing('url_scraping', t_total)
    resolved_url = scraped['resolved_url']

    t_lookup = citimings.start()
    default_colls = ['pilot-se', 'pilot-gr', 'pilot-at',
                     'factcheckers', 'fc-dev']
    ci_collections = cfg.get('relsents_in_colls', default_colls)
    assert cisearch_available
    cached = cisearch.find_preindexed_doc_by_url(resolved_url, ci_collections)
    lookup_timing = citimings.timing('retrieve_preindexed', t_lookup)

    if cached is not None:
        logger.info(
            'Found previously analyzed doc with %s claims and keys %s' % (
                len(cached.get('claims_content', [])),
                list(cached.keys())
            ))
        cached['timings'] = citimings.timing(
            'analyzed_doc', t_total, [scrape_timing, lookup_timing])
        return cached

    logger.info('Document not in existing indices, analysing...')
    t_analyze = citimings.start()
    analyzed = do_analyze_doc(scraped, cfg)
    analyze_timing = citimings.timing('semantic_analysis', t_analyze)
    analyzed['timings'] = citimings.timing(
        'analyzed_doc', t_total,
        [scrape_timing, lookup_timing, analyze_timing])
    return analyzed
def extract_relevant_sentences(tweet, cfg):
    """Extracts all relevant sentences from a tweet.

    :param tweet: a `Tweet` dict with at least `text` and `tweet_id`
    :param cfg: configuration map
    :returns: dict with key `in_tweet` (sentences extracted from the tweet
        text proper, possibly cleaned) and a `timings` entry
    :rtype: dict
    """
    t0 = citimings.start()
    detector = tweetsents.build_sent_detector(cfg)
    build_timing = citimings.timing("sent_detector", t0)

    t1 = citimings.start()
    detected = detector(tweet['text'])
    detect_timing = citimings.timing('sent_detection', t1)

    # TODO 1.4 if there are complex sentences, extract clauses
    return {
        'in_tweet': tweetsents.build_in_tweet_info(
            tweet['tweet_id'], detected, cfg),
        'timings': citimings.timing(
            'tweet_relevant_sentences', t0, [build_timing, detect_timing])}
def review_sents_in_tweet(tweet, cfg):
    """Extracts sentences in `tweet` and reviews their credibilities.

    :param tweet: a `Tweet` dict
    :param cfg: config options; `acred_review_format` selects the output
        format of the reviews
    :returns: a tuple (list of credibility reviews, list of step timings)
    :rtype: tuple
    """
    relsents = extract_relevant_sentences(tweet, cfg)
    relsents_timing = relsents.get('timings', None)

    t0 = citimings.start()
    tweet_sents = relsents['in_tweet']
    logger.info("Found %d relevant sentences in tweet" % len(tweet_sents))
    fmt = cfg.get('acred_review_format', 'schema.org')
    texts = [s['text'] for s in tweet_sents]
    if fmt == 'cred_assessment':
        reviews = aggqsent_credrev.calc_claim_cred(texts, cfg)
    else:
        sentences = [content.as_sentence(txt, appearance=[tweet], cfg=cfg)
                     for txt in texts]
        reviews = aggqsent_credrev.review(sentences, cfg)
    review_timing = citimings.timing('sents_in_tweet', t0)
    return reviews, [relsents_timing, review_timing]
def as_related_sent_or_claimReview(db_claim_doc, claimid2pred):
    """Convert a claim DB document into a `SimilarSent` result item.

    :param db_claim_doc: DB doc with at least `content_t` and `id`;
        optional comma-separated multi-value fields `urls_ss`/`domains_ss`
    :param claimid2pred: map from claim id to predicted similarity
    :returns: tuple of (SimilarSent dict, timing dict)
    :rtype: tuple
    """
    start = citimings.start()
    multival_separator = ','
    # ''.split(',') yields [''], so filter out empty values; without this
    # the "missing domain" fallback below could never trigger and
    # empty-string urls/domains leaked into the result.
    doc_urls = [u for u in db_claim_doc.get('urls_ss', '').split(
        multival_separator) if u]
    domains = [d for d in db_claim_doc.get('domains_ss', '').split(
        multival_separator) if d]
    if domains:
        domain = domains[0]
    elif doc_urls:
        # no stored domain: derive it from the first known url
        domain = content.domain_from_url(doc_urls[0])
    else:
        domain = None
    return {
        '@context': ci_context,
        '@type': 'SimilarSent',
        'sentence': db_claim_doc['content_t'],
        # unknown similarity defaults to a neutral 0.5
        'similarity': claimid2pred.get(db_claim_doc['id'], 0.5),
        'doc_url': doc_urls[0] if doc_urls else None,
        'appearance': doc_urls,
        'lang_orig': db_claim_doc.get('lang_s', None),
        'published_date': db_claim_doc.get(
            'published_dts', [None])[0] or db_claim_doc.get(
                'schema_org_cr_itemReviewed_datePublished_tdt', None),
        'domain': domain,
        'claimReview': lookup_claimReview_url(
            db_claim_doc['schema_org_cr_url'], claimReview_db)
    }, citimings.timing('as_related_sent', start, [])
def review(item, config):
    """Reviews the incoming item and returns a Review for it.

    :param item: a single item or a list of items; items should be
        `WebSite` instances. A bare URL string is tolerated and wrapped.
    :param config: a configuration map
    :returns: one or more Review objects for the input items
    :rtype: dict or list of dict
    """
    # use isinstance rather than type comparison (handles subclasses too)
    if isinstance(item, list):
        return [review(it, config) for it in item]
    if isinstance(item, str):
        logger.warning(
            'Assuming this is a website, you should wrap it into a `WebSite`')
        item = content.str_as_website(item)
    assert content.is_website(item)
    assert 'url' in item
    url = item['url']
    assert isinstance(url, str)
    domcred = calc_domain_credibility(url, config)
    return from_old_DomainCredibility(domcred, config)
def calc_domain_credibility(domain, cfg=None):
    """Calculates a `DomainCredibility` for a domain via MisinfoMe.

    Note that `DomainCredibility` is deprecated, use the `review` method
    which produces a `WebSiteCredReview` instead.

    :param domain: str e.g. `www.snopes.com`, or None for a default result
    :param cfg: unused; kept for backward compatibility (was a mutable
        default `{}`, now `None`)
    :returns: a `DomainCredibility`
    :rtype: dict
    """
    if domain is None:
        return default_domain_crediblity(
            domain, "Default credibility for unknown domain")
    # the original message lacked the `%` operator, so a failing assert
    # raised a TypeError ("'str' object is not callable") instead of
    # reporting the offending type
    assert isinstance(domain, str), 'Expecting str, but was %s' % type(domain)
    start = citimings.start()
    try:
        return {
            **misinfome_source_credibility(domain),
            '@context': 'DomainCredibility',
            '@type': 'DomainCredibility',
            'dateCreated': isodate.now_utc_timestamp(),
            'timings': citimings.timing(
                'misinfome_source_credibility', start)
        }
    except Exception as e:
        # best-effort: fall back to a default assessment on any failure
        logger.error("Failed misinfome source credibility. " + str(e))
        return default_domain_crediblity(
            domain, "Unable to retrieve credibility assessment")
def review_linked_docs_in_tweet(tweet, cfg):
    """Review the credibility of any docs linked in the tweet.

    :param tweet: a `Tweet` dict; must have a field `urls` with a list of
        URL objects (each with a `short_url`)
    :param cfg: config options
    :returns: a tuple with (i) a list of credibility reviews for the
        webpages linked in the tweet and (ii) a timing object
    :rtype: tuple
    """
    t0 = citimings.start()
    # Retrieve or request doc credibilities (deduplicated by short_url)
    # TODO retrieve existing credibility from DB or
    #  calculate from scratch
    unique_urls = list({u['short_url'] for u in tweet['urls']})
    linked_docs = [{
        '@context': 'http://schema.org',
        '@type': 'Webpage',
        'url': link,
        'mentioned_in': tweet} for link in unique_urls]
    reviews = [article_credrev.review(d, cfg) for d in linked_docs]
    timing = citimings.timing(
        'sub_doc_cred', t0,
        [rev['timings'] for rev in reviews if 'timings' in rev])
    return reviews, timing
def claimsim_result_as_claimcred(claimsim_result, cfg): """Convert a `SemanticClaimSimilarityResult` into a `ClaimCredibility` :param claimsim_results: :param cfg: :returns: :rtype: """ # TODO: delegate to reviewers to convert claimsim_result into # QSentCredReview, DBClaimCredibilityReview, WebSiteCredReview, etc. agg_start = citimings.start() qsent = claimsim_result['q_claim'] # qsent relsents = claimsim_result['results'] # simsents # sentSimReviews = [ # TODO: remove, just for feedback during refactoring # semsent_simrev.similarSent_as_SentSimilarityReview(simSent, claimsim_result, cfg) # for simSent in relsents] for rs in relsents: # claim search no longer does domain credibility, so we have to do it here if 'domain_credibility' not in rs: rs['domain_credibility'] = website_credrev.calc_domain_credibility( rs['domain']) relsents = [add_relative_credibility(rs, cfg) for rs in relsents] cred_dict = aggregate_credibility(relsents, cfg) cred_dict['source'] = 'credibility of %d related claims ' % len(relsents) agg_t = citimings.timing('claim_relsent_agg', agg_start) return { '@context': ci_context, '@type': 'ClaimCredibility', 'claim': qsent, 'item_assessed': { '@context': ci_context, '@type': 'Claim', 'claim': qsent }, # 'sentenceSimilarityReview': sentSimReviews, 'aggQSentCredReview': claimsim_result_as_aggQSentCredReview(claimsim_result, cfg), 'related_claims': _partition_related_sents(relsents, cfg), 'date_assessed': isodate.now_utc_timestamp(), 'assessor': { '@context': ci_context, '@type': 'CredibilityAssessor', 'name': 'SemanticSimilarityClaimCredibilityAssessor', 'version': '20200208' }, 'credibility': cred_dict, 'timings': agg_t }
def add_stance_detection(claim_sim_results, sim_threshold=0.7):
    """Adds `doc_content` and `*_stance` fields to the input sim_results.

    :param claim_sim_results: list of ClaimSimilarityResults
    :param sim_threshold: only perform stance detection for match results
        that are more similar than this value. Useful since stance
        detection is fairly slow.
    :returns: a modified `claim_sim_results` and timings
    :rtype: tuple
    """
    # NOTE(review): this first start marker is captured but never used;
    # kept as-is to preserve the original call sequence
    start = citimings.start()
    inner_start = citimings.start()
    labeled, label_timing = do_add_stance_labels(
        claim_sim_results, sim_threshold=sim_threshold)
    overall_timing = citimings.timing(
        'stance_pred', inner_start, [label_timing])
    return labeled, overall_timing
def retrieve_result_claims(claim_ids, q_claims, topn):
    """Fetch the DB documents for the matched claim ids.

    :param claim_ids: per-query lists of matched claim ids (flattened and
        deduplicated before retrieval)
    :param q_claims: the query claims (only used for logging)
    :param topn: how many matches were requested per query (logging only)
    :returns: tuple of (response map with `response.docs`, timing)
    :rtype: tuple
    """
    unique_ids = set(np.array(claim_ids).flatten())
    logger.info("Top %d claims for %d query sents resulted in %d claims" % (
        topn, len(q_claims), len(unique_ids)))
    t0 = citimings.start()
    docs = find_in_dbs(dbs=[preCrawled_sents_db, claimReviewed_sents_db],
                       q_ids=list(unique_ids))
    return ({'response': {'docs': docs}},
            citimings.timing('retrieve_claims', t0))
def do_assess_doc_content_cred(adoc, cfg):
    """Assess the credibility of an analyzed document's content.

    Selects claims in `adoc`, scores each claim's credibility, and
    aggregates the claim-level scores into a document-level one.

    :param adoc: an analyzed document
    :param cfg: configuration map
    :returns: aggregated content credibility dict with a `timings` field
    :rtype: dict
    """
    t0 = citimings.start()
    doc_claims = select_claims_in_doc(adoc, cfg)
    subcreds = aggqsent_credrev.calc_claim_cred(
        [c['text'] for c in doc_claims], cfg)
    logger.info("Found %d doc subclaim credibilities" % len(subcreds))
    doc_cred = aggregate_sub_creds(subcreds, 'document', cfg)
    doc_cred['timings'] = citimings.timing(
        'assess_doc_content_cred', t0,
        [sc.get('timings', None) for sc in subcreds])
    return doc_cred
def assess_article_cred(article, cfg):
    """Main credibility assessment for a single article.

    *Deprecated* you should move to `review_article`

    :param article: valid and normalised article
    :param cfg: config to guide this assessment
    :returns: a credibility assessment for the article
    :rtype: dict
    """
    t0 = citimings.start()
    doc = analyzed_doc(article, cfg)
    doc_timing = doc['timings']
    domain_cred = adoc_to_domain_cred(doc, cfg)
    body_cred = assess_doc_content_cred(doc, cfg)
    overall = aggregate_article_cred(domain_cred, body_cred, cfg)
    total_timing = citimings.timing('assess_article_cred', t0, [
        doc_timing,
        domain_cred.get('timings', None),
        body_cred.get('timings', None)])
    return {
        '@context': content.ci_context,
        '@type': 'ArticleCredibilityAssessment',
        'doc_url': article['url'],
        'item_assessed': article,
        # NOTE(review): key is misspelled ('asessed') but is part of the
        # emitted schema, so it is preserved as-is
        'date_asessed': isodate.now_utc_timestamp(),
        'assessor': {
            '@context': content.ci_context,
            '@type': 'CredibilityAssessor',
            'name': 'ArticleCredibilityAssessor',
            'version': '20200207'},
        'doc_resolved_url': doc.get('resolved_url', doc.get('url')),
        'analyzed_doc': doc,
        **overall,
        'sub_assessments': [domain_cred, body_cred],
        'timings': total_timing}
def assess_doc_cred(doc, cfg):
    """Main credibility assessment for a single doc

    Dispatches to the tweet or article reviewer depending on the doc type;
    unsupported doc types get a zero-credibility response whose shape
    depends on `cfg['acred_review_format']`.

    :param doc: a validated and normalised document, ready for
        credibility assessment
    :param cfg: any configs we need to execute/customise the assessment
    :returns: a credibility assessment for the doc
    :rtype: dict
    """
    start = citimings.start()
    if content.is_tweet_doc(doc):
        result = tweet_credrev.review(doc, cfg)
        return result
    elif content.is_article_doc(doc):
        result = article_credrev.review(doc, cfg)
        return result
    else:
        # unsupported doc type: build an "error" assessment instead of
        # raising, so callers always receive a response document
        rev_format = cfg.get('acred_review_format', 'schema.org')
        # NOTE(review): the doubled ')' looks like a typo, but the message
        # is a runtime string so it is left unchanged here
        msg = 'Unsupported document (not a %s))' % supported_doc_types
        if rev_format == 'cred_assessment':
            # deprecated flat assessment format
            return {
                '@context': ci_context,
                '@type': 'DocumentCredibilityAssessment',
                'doc_url': doc['url'],
                'item_assessed': doc,
                'cred_assessment_error': msg,
                'date_assessed': isodate.now_utc_timestamp(),
                'timings': citimings.timing('assess_doc_cred', start),
                'credibility': 0,
                'confidence': 0,
                'explanation': msg}
        else:
            # schema.org-style Review; the rating gets its identifier
            # first so the enclosing review's identifier can include it
            rating = {
                '@type': 'Rating',
                'ratingValue': 0.0,
                'confidence': 0.0,
                'ratingExplanation': msg}
            result = {
                '@context': ci_context,
                '@type': 'DocumentCredReview',
                'reviewAspect': 'credibility',
                'itemReviewed': doc,
                'dateCreated': isodate.now_utc_timestamp(),
                'author': bot_info([], cfg),
                'reviewRating': {
                    **rating,
                    'identifier': itnorm.calc_identifier(rating, cfg)}
            }
            return {
                **result,
                'identifier': itnorm.calc_identifier(result, cfg)
            }
def default_domain_crediblity(domain, explanation):
    """Build a zero-confidence fallback `DomainCredibility` for `domain`.

    :param domain: the domain being assessed (may be None)
    :param explanation: human-readable reason for the default assessment
    :returns: a `DomainCredibility`-shaped dict with value/confidence 0.0
    :rtype: dict
    """
    t0 = citimings.start()
    credibility = {
        '@context': ci_context,
        '@type': 'DomainCredibility',
        'item_assessed': domain,
        "value": 0.0,  # in range [-1, 1]
        "confidence": 0.0,
        "explanation": explanation,
        'timings': citimings.timing('default_domain_crediblity', t0)}
    return {
        "credibility": credibility,
        "assessments": []}
def f_composite():
    """Compose the outputs of `f` and `g`, merging their timings.

    :returns: the merged key/value pairs of `f()` and `g()` plus a
        composite `timings` entry covering both calls
    :rtype: dict
    """
    t0 = citimings.start()
    sleep(0.01)
    parts = [f(), g()]
    composed = {}
    for part in parts:
        composed.update(part)
    sub_timings = [p['timings'] for p in parts
                   if p and type(p) is dict and 'timings' in p]
    # overwrites the individual timings merged in above
    composed['timings'] = citimings.timing('f_composite', t0, sub_timings)
    return composed
def assess_tweet_cred(tweet, cfg):
    """Main credibility assessment for a single tweet.

    *Deprecated* use `review_tweet` instead

    :param tweet: must have field `text` with the textual content, and
        field `urls` with shortened urls appearing in the original tweet
    :param cfg: config options; `acred_review_format` must be
        'cred_assessment'
    :returns: a credibility assessment for the tweet
    :rtype: dict
    """
    t0 = citimings.start()
    fmt = cfg.get('acred_review_format', 'schema.org')
    assert fmt == 'cred_assessment', fmt

    # generate (or retrieve) sub reviews
    sent_reviews, step_timings = review_sents_in_tweet(tweet, cfg)
    linked_reviews, linked_timing = review_linked_docs_in_tweet(tweet, cfg)

    t_agg = citimings.start()
    aggregated = aggregate_tweet_cred(sent_reviews, linked_reviews, cfg)
    assessment = {
        '@context': ci_context,
        '@type': 'TweetCredibilityAssessment',
        'tweet_id': int(tweet['tweet_id']),
        'sub_assessments': sent_reviews + linked_reviews,
        'item_assessed': tweet,
        'sentences_in_tweet': sent_reviews,
        **aggregated}
    assessment = remove_tweet_assessment_details(assessment)
    agg_timing = citimings.timing('aggregation_cleaning_time', t_agg)

    step_timings += [linked_timing, agg_timing]
    assessment['timings'] = citimings.timing(
        'assess_tweet_cred', t0, step_timings)
    return assessment
def predict_stance():
    """Flask endpoint: predict stance labels for (claim, doc_body) pairs.

    Accepts a JSON body that is either a single object or a list of
    objects, each with keys `qclaim` and `doc_bodies`. Responds with
    parallel `labels` and `confidences` lists plus a `meta` block.

    :returns: a JSON response, or an error tuple/response on failure
    """
    try:
        start = citimings.start()
        req_json = request.get_json()
        # Normalise single-object and list payloads into one code path
        # (previously the parsing and the response building were
        # duplicated across branches).
        queries = req_json if isinstance(req_json, list) else [req_json]
        inputs = []
        for claim_bods in queries:
            qclaim = claim_bods['qclaim']
            doc_bodies = claim_bods['doc_bodies']
            validate_stance_pred_q(qclaim, doc_bodies)
            inputs.extend([(qclaim, docbod) for docbod in doc_bodies])
        tokmodmeta = resources.stance_tokmodmeta
        if len(inputs) == 0:
            labels, confs = [], []
        else:
            labels, confs = stancepred.predict_stances(tokmodmeta, inputs)
        return jsonify({
            'labels': labels,
            'confidences': confs,
            'meta': {
                'model_info': tokmodmeta['model_info'],
                'timings': citimings.timing('predict_stance', start),
                'n_pairs': len(inputs)
            }
        })
    except werkzeug.exceptions.BadRequest as e:
        logger.exception(e)
        return 'bad request! ' + str(e), 400
    except Exception as e:
        logger.exception(e)
        resp = jsonify({"error": str(e)})
        resp.status_code = 500
        return resp
def predict_worthiness():
    """Flask endpoint: predict check-worthiness labels for sentences.

    Expects a JSON body with key `sentences` (a string or a list of
    strings); responds with predicted labels, confidences and a stable
    hash id per sentence.

    :returns: a JSON response, or an error tuple/response on failure
    """
    try:
        tokmodmeta = resources.worthiness_tokmodmeta
        start = citimings.start()
        req_json = request.get_json()
        q_sents = req_json['sentences']
        if q_sents is None:
            raise ValueError("sentences parameter is mandatory, only got %s" % (req_json))
        if isinstance(q_sents, str):
            q_sents = [q_sents]
        if not isinstance(q_sents, list):
            raise ValueError(
                "Type %s not accepted. Valid formats: string or list" % type(req_json['sentences']))
        if len(q_sents) == 0:
            label, conf, ids = [], [], []
        else:
            label, conf = worthinesspred.cw_pred_batched(tokmodmeta, q_sents)
            logger.debug('predicted %s labels and %s confidences' % (len(label), len(conf)))
            # previously the loop variable shadowed the `ids` result list
            # (`for ids in q_sents`), which worked but was very confusing
            ids = [hashu.calc_str_hash(sent) for sent in q_sents]
        return jsonify({
            'worthiness_checked_sentences': {
                'sentences': q_sents,
                'predicted_labels': label,
                'prediction_confidences': conf,
                'sentence_ids': ids,
            },
            'meta': {
                'model_info': tokmodmeta['model_info'],
                'timings': citimings.timing('predict_worthiness', start),
            }
        })
    except werkzeug.exceptions.BadRequest as e:
        logger.exception(e)
        return 'bad request! ' + str(e), 400
    except Exception as e:
        logger.exception(e)
        resp = jsonify({"error": str(e)})
        resp.status_code = 500
        return resp
def search_claim_bots():
    """Returns a map describing the bots involved in `search_claim`.

    :returns: a map describing the bots involved in `search_claim`
    :rtype: dict
    """
    t0 = citimings.start()
    involved = {
        'simReviewer': simReviewer(),  # includes the sentence encoder bot!
        'stancePred': stancePredictor()}
    timing = citimings.timing('search_claim_bots', t0)
    return {
        'results': [],  # no similar sentence results
        'bots': involved,
        'resultsHeader': {
            'QTime': timing['total_ms'],
            'timings': timing,
            'params': {}}}
def analyze_doc(doc, cfg):
    """Semantically analyse a partial `doc`, producing an AW-Solr-like doc.

    :param doc: dict that must contain at least fields `content` and `id`
        (when `content` is missing, a `url` is required so the content can
        be scraped). Optional but recommended: `title` and provenance
        metadata fields.
    :param cfg: configuration to influence the analysis; in particular it
        should identify reusable AW services (e.g. a semantic-api
        endpoint), assumed to suit the content's language. When
        `expand_claims` is set, claim contents are extracted as well.
    :returns: an analyzed doc aiming to be compatible with the standard AW
        Solr schema: the input fields plus semantic analysis fields such
        as `taxonomy_x_tax`, entity `y_ss` and `fact_*_tax` fields. For
        full compatibility, perform a final check against the Solr schema
        to avoid adding fields by mistake.
    :rtype: dict
    """
    assert type(doc) is dict, str(type(doc))
    if 'content' not in doc:
        assert 'url' in doc, 'Expecting at least a url to resolve doc'
        doc = {**doc, **url_scraper.scrape(doc['url'])}
    t0 = citimings.start()
    doc = try_translate(doc, cfg)
    analyze = get_analyzer_fn(cfg)
    analysis = analyze(doc['content'], doc['title'], cfg)
    merged = merge_semantic_analysis(doc, analysis)
    merged['elaboration_elapsedtime'] = int(
        citimings.timing('elaboration', t0)['total_ms'])
    if cfg.get('expand_claims', False):
        # imported lazily: claim expansion is optional and the module
        # pulls in extra machinery
        import semantic_analyzer.claim_content_expander as cce
        merged['claims_content'] = cce.calc_claim_content(merged, cfg)
    return merged
def dummyPrediction(tweet):
    """Return a placeholder `TweetCredibilityAssessment` for `tweet`.

    The credibility is random and confidence is zero; no actual analysis
    is performed.

    :param tweet: a `Tweet` dict with a `tweet_id` field
    :rtype: dict
    """
    t0 = citimings.start()
    assessment = {
        '@context': ci_context,
        '@type': 'TweetCredibilityAssessment',
        'tweet_id': int(tweet['tweet_id']),
        'item_assessed': tweet,
        'credibility': random.random(),
        'confidence': 0.0,
        'explanation': 'Dummy prediction, no actual analysis performed.',
        'sub_assessments': [],
        'date_assessed': isodate.now_utc_timestamp(),
        'assessor': {'@context': ci_context,
                     'name': 'dummyCredibilityPredictor'},
        'timings': citimings.timing('dummyPrediction', t0)}
    # deprecated, now as sub_assessments:
    #   'sentences_in_tweets': [], 'sentences_linked': []
    return assessment
def q_resp_to_related_sent(q_resp, claimid2pred):
    """Convert a DB query response into sorted `SimilarSent` results.

    :param q_resp: DB response map with `response.docs`
    :param claimid2pred: map from claim id to predicted similarity for ONE
        query claim; docs not in this map belong to a different q_claim
        and are filtered out
    :returns: tuple of (results sorted by decreasing similarity, timing)
    :rtype: tuple
    """
    start = citimings.start()
    docs = q_resp['response']['docs']
    # we are only interested in documents that appear in claimid2pred
    # otherwise these may be results for a differnt q_claim
    docs4claim = [doc for doc in docs if doc['id'] in claimid2pred]
    logger.info("Found %d (of %d) claims" % (
        len(docs4claim), len(claimid2pred)))
    if len(claimid2pred) != len(docs4claim):
        # logger.warn is a deprecated alias; use logger.warning
        logger.warning("Expecting %d docs, but found %d.\n%s" % (
            len(claimid2pred), len(docs4claim), str(claimid2pred)))
    result_and_timings = [
        as_related_sent_or_claimReview(doc, claimid2pred)
        for doc in docs4claim]
    sub_ts = [rt[1] for rt in result_and_timings]
    result = [rt[0] for rt in result_and_timings]
    result = sorted(result, key=lambda doc: doc['similarity'], reverse=True)
    return result, citimings.timing('doc_as_relsent', start, sub_ts)
def calc_claim_cred(sents, cfg):
    """Produces ClaimCredibilityAssessments for a list of sents.

    :param sents: list of input sentences (assumed to be claims)
    :param cfg: config parameters
    :returns: a list of coinform `ClaimCredibility` assessments
    :rtype: list
    """
    t0 = citimings.start()
    sim_results = claimsim.find_related_sentences(sents, cfg)
    search_timing = citimings.timing('find_relsents', t0)
    creds = [claimsim_result_as_claimcred(sim_result, cfg)
             for sim_result in sim_results]
    for cred in creds:
        # fold the shared search timing into each result's own timings
        cred['timings'] = citimings.timing(
            'claimcred', t0, [search_timing, cred['timings']])
    return creds
def build_in_linked_info(tweetID, urls, cfg):
    """Extract claims from the documents linked by a tweet.

    :param tweetID: id of the linking tweet
    :param urls: list of urls appearing in the tweet
    :param cfg: config options
    :returns: list of claim dicts, each annotated with its source url,
        extraction timings and the linking tweet id
    :rtype: list
    :raises Exception: re-raises any failure to analyse a linked url
    """
    # TODO: refactor
    #  why extract claims here? let predictor assess credibility of doc
    #  so no need to extract sentences here
    in_linked_doc = []
    for url in urls:
        start = citimings.start()
        try:
            adoc = analyzed_doc(url, cfg)
            adoc_t = adoc['timings']
            claims_in_doc = list(gen_claims_from_analysed_doc(adoc, cfg))
            logger.info('Extracted %d claims from url' % len(claims_in_doc))
            claims_in_doc = [
                {**claim,
                 'url_in_tweet': url,
                 'timings': citimings.timing(
                     'url_in_tweet_claim_extraction', start, [adoc_t]),
                 'linked_by_tweet': tweetID}
                for claim in claims_in_doc]
            in_linked_doc.extend(claims_in_doc)
        except Exception as e:
            # use the module logger (not print) and a bare `raise` so the
            # original traceback is preserved
            logger.error("Unresolved url: %s\n%s" % (url, str(e)))
            raise
    return in_linked_doc
def analyzed_doc(article, cfg):
    """Returns an analysed version for an input article.

    :param article: an `Article` item; really anything with fields `url`,
        `content` and `id`. See `semantic_analyzer.analyzer.analyze_doc`.
    :param cfg: config options; may define `relsents_in_colls`
    :returns: an analyzed doc. Crucially, it will contain a field
        `claims_content`. See `semantic_analyzer.analyzer.analyze_doc`
        for the basic analysed doc.
    :rtype: dict
    """
    t0 = citimings.start()
    collections = cfg.get('relsents_in_colls', [
        'generic', 'pilot-se', 'pilot-gr', 'pilot-at', 'factcheckers',
        'fc-dev'
    ])
    cached = gcssearch.find_preindexed_doc_by_url(article['url'], collections)
    if cached is None:
        # maybe the article url redirects to an already-indexed url
        fetched = url_scraper.fetch_url(article['url'])
        final_url = fetched['resolved_url']
        if final_url != article['url']:
            cached = gcssearch.find_preindexed_doc_by_url(
                final_url, collections)
            # TODO: we may want to add the article['url'] as an alias
            #  for this, the DB schema needs to support this and we
            #  need to be able to submit new values for this list
            #  of url values. Define `same_as_ss` and update
            #  gcsearch to query and update this.
    cache_timing = citimings.timing('retrieve_preindexed', t0)
    if cached is not None:
        cached['timings'] = cache_timing
        return cached
    adoc = semalyzer.analyze_doc(article, {**cfg, 'expand_claims': True})
    adoc['timings'] = citimings.timing(
        'analyzed_doc', t0, [cache_timing, adoc.get('timings')])
    return adoc
def f():
    """Toy timed step: sleeps briefly and returns a fixed mapping.

    :returns: dict with keys 'a' and 'c' plus its own `timings`
    :rtype: dict
    """
    t0 = citimings.start()
    sleep(0.05)
    payload = {'a': 'b', 'c': 'd'}
    payload['timings'] = citimings.timing('f', t0)
    return payload
def g():
    """Toy timed step: sleeps briefly and returns a fixed mapping.

    :returns: dict with keys 'e' and 'g' plus its own `timings`
    :rtype: dict
    """
    t0 = citimings.start()
    sleep(0.1)
    payload = {'e': 'f', 'g': 'h'}
    payload['timings'] = citimings.timing('g', t0)
    return payload
def do_add_stance_labels(claim_sim_results, sim_threshold=0.7, max_len=128):
    """Predict stance labels for similar sentences and attach them in place.

    For each claim sim result, collects the sentences of sufficiently
    similar matches, sends (qclaim, sentence) pairs to the stance
    predictor, and writes `sent_stance`/`sent_stance_confidence` fields
    back onto the matching result dicts (mutated via aliasing through
    `rs_targets`).

    :param claim_sim_results: list of claim sim results, each with
        `q_claim` and `results` (similar sentences with a `similarity`)
    :param sim_threshold: only predict stance for matches more similar
        than this value (stance prediction is slow)
    :param max_len: token budget; query claims longer than 2/3 of it are
        skipped entirely
    :returns: tuple of (mutated claim_sim_results, timing)
    :rtype: tuple
    """
    start = citimings.start()

    def trim(s, max_len):
        # truncate to at most max_len whitespace-separated tokens
        s_toks = s.split(' ')
        if len(s_toks) > max_len:
            return ' '.join(s_toks[:max_len])
        else:
            return s

    stance_reqs = []
    for cresult in claim_sim_results:
        q_claim = cresult['q_claim']
        q_claim_toks = q_claim.split(' ')
        if len(q_claim_toks) > (2*max_len/3):
            # claim alone would eat most of the model's input budget
            logger.warning('Skip stance_pred: q_claim is too large %d ' % (
                len(q_claim_toks)))
            continue
        bods, rs_targets = [], []
        for rs in cresult['results']:
            if rs['similarity'] < sim_threshold:
                continue
            # docbod = rs.get('doc_content', None)
            # if docbod is not None:
            #     bods.append(trim(docbod, max_len))
            #     rs_targets.append({"rs": rs,
            #                        'field': 'doc_stance'})
            sent = rs.get('sentence', None)
            if sent is not None:
                # rs_targets keeps an alias to rs so predictions can be
                # written back onto the right result dict below
                bods.append(sent)
                rs_targets.append({'rs': rs,
                                   'field': 'sent_stance'})
        if len(bods) > 0:
            stance_reqs.append({
                'qclaim': q_claim,
                'doc_bodies': bods,
                'rs_targets': rs_targets})
    if len(stance_reqs) == 0:
        # nothing to predict; return inputs unchanged
        return claim_sim_results, citimings.timing(
            'predict_stances', start)
    labels, confs, stanceRev = predict_stances(
        # don't send the rs_targets to server
        [dictu.select_keys(sr, ['qclaim', 'doc_bodies'])
         for sr in stance_reqs])
    for csr in claim_sim_results:
        csr['stanceReviewer'] = stanceRev
    stance_docs_t = citimings.timing('doc_stance_pred', start)
    logger.info("Predicted stances %s with scores %s" % (labels, confs))
    # flatten targets in the same order the (qclaim, body) pairs were sent,
    # so they line up with the returned labels/confidences
    rs_targets = [rs_target
                  for req in stance_reqs
                  for rs_target in req['rs_targets']]
    assert len(rs_targets) == len(labels)
    assert len(confs) == len(labels)
    for rs_target, label, conf in zip(rs_targets, labels, confs):
        rs = rs_target['rs']
        field = rs_target['field']
        rs[field] = label
        rs['%s_confidence' % field] = conf
    return claim_sim_results, citimings.timing(
        'predict_stances', start, [stance_docs_t])