def is_socmedia_platform(url, domain, cfg):
    """Return True if `url` (or, failing that, `domain`) belongs to a known
    social-media platform.

    Matching is attempted in order:
      1. by the netloc extracted from `url` (scheme independent);
      2. if no netloc could be extracted, by URL prefix against the
         configured platform URLs (scheme dependent, so this may miss);
      3. otherwise by the `domain` argument against the platform netlocs.

    :param url: candidate URL string, may be None
    :param domain: candidate domain string, may be None
    :param cfg: config dict; key 'acred_socmedia_urls' overrides the default
        platform URL list
    :returns: bool
    """
    url_nl = content.domain_from_url(url)
    sm_urls = cfg.get(
        'acred_socmedia_urls',
        ['http://twitter.com', 'http://facebook.com', 'http://instagram.com'])
    sm_netlocs = [content.domain_from_url(sm_url) for sm_url in sm_urls]
    sm_netlocs = [nl for nl in sm_netlocs if nl is not None]
    if url_nl is not None:
        # match by netloc (scheme independent)
        if url_nl in sm_netlocs:
            return True
        # fall through to matching by domain below
    elif url is not None:
        # failed to extract netloc, so try to match by prefix
        # this is *not* scheme independent, so may fail
        for sm_url in sm_urls:
            if url.startswith(sm_url):
                logger.info('found match: %s starts with %s' % (url, sm_url))
                return True
        # fix: only log "no match" when there really was none (the original
        # logged this even after a successful match), and the message now
        # names social-media urls instead of the copy-pasted "factceckers"
        logger.info('Found no match for %s in %s known social media urls' % (
            url, len(sm_urls)))
        return False
    # no url provided, or no match for url, so match by domain
    if domain is None:
        return False
    return domain in sm_netlocs
def as_related_sent_or_claimReview(db_claim_doc, claimid2pred):
    """Convert a DB claim document into a `SimilarSent` dict, with timing.

    :param db_claim_doc: solr-style claim document (multi-valued fields
        such as 'urls_ss'/'domains_ss' are comma-separated strings)
    :param claimid2pred: dict mapping claim doc id -> similarity score;
        missing ids default to 0.5
    :returns: tuple (SimilarSent dict, timing dict)
    """
    start = citimings.start()
    multival_separator = ','
    # fix: ''.split(',') yields [''] — never [] — so the original
    # len(...) == 0 checks could not fire and empty strings leaked
    # through as doc_url/domain values. Filter empty tokens out.
    doc_urls = [u for u in db_claim_doc.get('urls_ss', '').split(
        multival_separator) if u]
    domains = [d for d in db_claim_doc.get('domains_ss', '').split(
        multival_separator) if d]
    domain = None
    if len(domains) == 0:
        # logger.warn("Claim doc is missing domains_ss")
        if len(doc_urls) > 0:
            # derive the domain from the first appearance url
            domain = content.domain_from_url(doc_urls[0])
    else:
        domain = domains[0]
    return {
        '@context': ci_context,
        '@type': 'SimilarSent',
        'sentence': db_claim_doc['content_t'],
        'similarity': claimid2pred.get(db_claim_doc['id'], 0.5),
        'doc_url': None if len(doc_urls) == 0 else doc_urls[0],
        'appearance': doc_urls,
        'lang_orig': db_claim_doc.get('lang_s', None),
        # prefer the doc's own published date; fall back to the date the
        # claimReview says the reviewed item was published
        'published_date': db_claim_doc.get(
            'published_dts', [None])[0] or db_claim_doc.get(
                'schema_org_cr_itemReviewed_datePublished_tdt', None),
        'domain': domain,
        'claimReview': lookup_claimReview_url(
            db_claim_doc['schema_org_cr_url'], claimReview_db)
    }, citimings.timing('as_related_sent', start, [])
def author_name(claimReview, defValue="unknown author"):
    """Best-effort human-readable author name for a claimReview.

    Uses the author's 'name' field when present; otherwise derives a name
    from the author URL's domain, stripped of a leading 'www.' and a
    trailing '.com'. Falls back to `defValue` when nothing usable is found.

    :param claimReview: claimReview dict (schema.org-like structure)
    :param defValue: value returned when no name can be determined
    :returns: str
    """
    name = dictu.get_in(claimReview, ['author', 'name'])
    if name is None:
        url = dictu.get_in(claimReview, ['author', 'url'])
        name = content.domain_from_url(url)
        # fix: domain_from_url may return None (e.g. missing author url);
        # the original then crashed with AttributeError on .startswith
        if name is not None:
            # fix: strip only the leading/trailing part; str.replace would
            # also remove interior occurrences of 'www.' or '.com'
            if name.startswith('www.'):
                name = name[len('www.'):]
            if name.endswith('.com'):
                name = name[:-len('.com')]
    return name or defValue
def adoc_to_domain_cred(adoc, cfg):
    """Calculate a credibility review for the domain of an article doc.

    Resolves the domain from, in order: 'domain', 'source_id', or the
    netloc of 'url'. When no domain can be determined, returns a default
    (unknown-domain) credibility instead of failing.

    :param adoc: article document dict
    :param cfg: config dict (currently unused here, kept for interface
        consistency with sibling converters)
    :returns: a domain credibility review from `website_credrev`
    """
    domain = adoc.get('domain', adoc.get('source_id'))
    if isinstance(domain, list):  # fix: idiomatic type test (was type()==)
        domain = domain[0] if domain else None
    if domain is None:
        # fix: adoc.get avoids a KeyError precisely in the missing-data
        # path this fallback is meant to handle
        domain = content.domain_from_url(adoc.get('url'))
    if domain is None or domain == '':
        logger.warning('Missing domain for url? %s (keys %s)' % (
            adoc.get('url'), list(adoc.keys())))
        # NOTE: 'crediblity' spelling matches the website_credrev API name
        return website_credrev.default_domain_crediblity(
            domain, "unknown domain")
    return website_credrev.calc_domain_credibility(domain)
def similarSent_as_WebSiteCredRev(simSent, cfg):
    """Review the credibility of the website where a SimilarSent appeared.

    :param simSent: a SimilarSent dict (asserted via content.is_SimilarSent)
    :param cfg: config dict passed through to the review functions
    :returns: a WebSiteCredRev, or None when no usable domain is available
    """
    assert content.is_SimilarSent(simSent), '%s' % (simSent)
    if 'domain_credibility' in simSent:
        # legacy field: convert the old DomainCredibility representation
        return from_old_DomainCredibility(simSent['domain_credibility'], cfg)
    elif 'domain' in simSent:
        dom = simSent['domain']
        if isinstance(dom, str) and dom:  # fix: idiomatic check (was type() is)
            dom = content.str_as_website(dom)
        if dom:
            return review(dom, cfg)
        return None
    else:
        # no explicit domain: derive a website from the document url
        doc_url = simSent['doc_url']
        domain = content.str_as_website(content.domain_from_url(doc_url))
        return review(domain, cfg)
def test_domain_from_url_01():
    """domain_from_url extracts the netloc of the effective URL, keeping the
    port and looking past web.archive.org wrapper prefixes."""
    cases = [
        ('http://theguardian.com/a/b',
         'theguardian.com'),
        ('https://web.archive.org/web/20150214123436/'
         'http://newsexaminer.net:80/entertainment/',
         'newsexaminer.net:80'),
    ]
    for url, expected in cases:
        assert expected == content.domain_from_url(url)