예제 #1
0
def is_socmedia_platform(url, domain, cfg):
    """Return True if the url or domain belongs to a known social-media platform.

    Matching strategy, in order:
      1. netloc of `url` against the configured platform netlocs (scheme independent)
      2. `url` prefix against the configured platform URLs (scheme dependent)
      3. bare `domain` against the configured platform netlocs

    :param url: candidate URL string, may be None
    :param domain: candidate domain string, may be None
    :param cfg: config dict; key 'acred_socmedia_urls' overrides the default
        platform URL list
    :returns: bool
    """
    url_nl = content.domain_from_url(url)
    sm_urls = cfg.get(
        'acred_socmedia_urls',
        ['http://twitter.com', 'http://facebook.com', 'http://instagram.com'])
    sm_netlocs = [content.domain_from_url(sm_url) for sm_url in sm_urls]
    sm_netlocs = [nl for nl in sm_netlocs if nl is not None]
    if url_nl is not None:
        # match by netloc (scheme independent)
        if url_nl in sm_netlocs:
            return True
    elif url is not None:
        # failed to extract netloc, so try to match by prefix
        #  this is *not* scheme independent, so may fail
        for sm_url in sm_urls:
            if url.startswith(sm_url):
                logger.info('found match: %s starts with %s' % (url, sm_url))
                return True
        # Bug fix: this "no match" message used to be logged even when a match
        # had just been found and logged above. Also fixed the message: these
        # are social-media platforms, not factcheckers.
        logger.info('Found no match for %s in %s known platforms ' %
                    (url, len(sm_urls)))
        return False

    # no url provided, or no match for url, so match by domain
    if domain is None:
        return False
    return domain in sm_netlocs
예제 #2
0
def as_related_sent_or_claimReview(db_claim_doc, claimid2pred):
    """Convert a claim DB document into a `SimilarSent` dict (plus timing).

    :param db_claim_doc: dict for a claim doc as stored in the DB. Solr-style
        multi-valued fields `urls_ss` / `domains_ss` are comma-separated
        strings. Must contain at least 'content_t', 'id' and
        'schema_org_cr_url'.
    :param claimid2pred: dict from claim doc id to a similarity score
        (default 0.5 when the id is missing)
    :returns: tuple of (SimilarSent dict, timing dict)
    """
    start = citimings.start()
    multival_separator = ','
    # Bug fix: ''.split(',') yields [''] (never an empty list), so a missing
    # `domains_ss` used to produce domain == '' instead of falling back to the
    # first url's domain; similarly doc_url became '' instead of None.
    # Filtering out empty strings restores the intended fallbacks.
    doc_urls = [u for u in
                db_claim_doc.get('urls_ss', '').split(multival_separator) if u]
    domains = [d for d in
               db_claim_doc.get('domains_ss', '').split(multival_separator) if d]
    domain = None
    if len(domains) == 0:
        # logger.warn("Claim doc is missing domains_ss")
        if len(doc_urls) > 0:
            domain = content.domain_from_url(doc_urls[0])
    else:
        domain = domains[0]

    return {
        '@context': ci_context,
        '@type': 'SimilarSent',
        'sentence': db_claim_doc['content_t'],
        'similarity': claimid2pred.get(db_claim_doc['id'], 0.5),
        'doc_url': None if len(doc_urls) == 0 else doc_urls[0],
        'appearance': doc_urls,
        'lang_orig': db_claim_doc.get('lang_s', None),
        # Bug fix: get('published_dts', [None])[0] raised IndexError when the
        # field was present but an empty list; `or [None]` covers both cases.
        'published_date': (db_claim_doc.get('published_dts') or [None])[0] or
        db_claim_doc.get('schema_org_cr_itemReviewed_datePublished_tdt', None),
        'domain': domain,
        # NOTE(review): `claimReview_db` is read from enclosing module scope —
        # presumably a module-level DB handle; confirm against the full file.
        'claimReview': lookup_claimReview_url(db_claim_doc['schema_org_cr_url'], claimReview_db)
    }, citimings.timing('as_related_sent', start,
                        [])
예제 #3
0
def author_name(claimReview, defValue="unknown author"):
    """Return a human-readable name for the claimReview's author.

    Falls back to the domain of the author's URL (with a leading 'www.' and a
    trailing '.com' stripped), and finally to `defValue`.

    :param claimReview: dict with an optional nested ['author']['name'] /
        ['author']['url']
    :param defValue: value returned when no name can be derived
    :returns: str
    """
    name = dictu.get_in(claimReview, ['author', 'name'])
    if name is None:
        url = dictu.get_in(claimReview, ['author', 'url'])
        name = content.domain_from_url(url)
        # Bug fix: domain_from_url may return None (e.g. no author url);
        # guard before calling string methods on it.
        if name:
            # Bug fix: str.replace removed 'www.'/'.com' anywhere in the
            # domain, not just at the edges; strip only prefix/suffix.
            if name.startswith('www.'):
                name = name[len('www.'):]
            if name.endswith('.com'):
                name = name[:-len('.com')]
    return name or defValue
예제 #4
0
def adoc_to_domain_cred(adoc, cfg):
    """Compute a domain credibility review for an article doc.

    :param adoc: article doc dict; the domain is taken from 'domain' or
        'source_id' (str or multi-valued list), falling back to the netloc
        of 'url'
    :param cfg: configuration dict (unused here, kept for interface
        consistency with sibling reviewers)
    :returns: a domain credibility dict from `website_credrev`
    """
    domain = adoc.get('domain', adoc.get('source_id'))
    # multi-valued field: take the first value, if any
    if isinstance(domain, list):
        domain = domain[0] if domain else None
    if domain is None:
        # Bug fix: use .get so a doc missing both domain *and* url degrades
        # into the "unknown domain" path below instead of raising KeyError.
        domain = content.domain_from_url(adoc.get('url'))
    if not domain:
        logger.warning('Missing domain for url? %s (keys %s)' %
                       (adoc.get('url'), list(adoc.keys())))
        return website_credrev.default_domain_crediblity(
            domain, "unknown domain")
    return website_credrev.calc_domain_credibility(domain)
예제 #5
0
def similarSent_as_WebSiteCredRev(simSent, cfg):
    """Review the credibility of the website a SimilarSent comes from.

    :param simSent: a `SimilarSent` dict; may carry a legacy
        'domain_credibility', a 'domain', or only a 'doc_url'
    :param cfg: configuration passed through to the underlying reviewers
    :returns: a website credibility review, or None when no usable domain
        can be determined
    """
    assert content.is_SimilarSent(simSent), '%s' % (simSent)
    if 'domain_credibility' in simSent:
        # legacy field: convert the old DomainCredibility into a review
        return from_old_DomainCredibility(simSent['domain_credibility'], cfg)
    elif 'domain' in simSent:
        dom = simSent['domain']
        if isinstance(dom, str) and dom:
            dom = content.str_as_website(dom)
        if dom:
            return review(dom, cfg)
        else:
            return None
    else:
        # Fall back to extracting the domain from the document URL.
        # Bug fix: 'doc_url' may be absent (KeyError) and domain_from_url may
        # return None, which used to be passed straight into str_as_website.
        doc_url = simSent.get('doc_url')
        dom = content.domain_from_url(doc_url)
        if dom is None:
            return None
        return review(content.str_as_website(dom), cfg)
예제 #6
0
def test_domain_from_url_01():
    """domain_from_url extracts the original site's netloc — keeping any
    explicit port, and digging out the archived site from web.archive URLs."""
    cases = [
        ('http://theguardian.com/a/b',
         'theguardian.com'),
        ('https://web.archive.org/web/20150214123436/http://newsexaminer.net:80/entertainment/',
         'newsexaminer.net:80'),
    ]
    for url, expected in cases:
        assert expected == content.domain_from_url(url)