Beispiel #1
0
 def site_status(cls, session, include_disabled=False):
     """Classify every site by reachability and refresh stable base URLs.

     Each ``(id, domain)`` row of the ``site`` table is passed to
     ``infer_base_url``; the site is then sorted into one of three groups:

     * inactive   -- ``infer_base_url`` returned ``None``;
     * stable     -- ``owns_url(domain, base_url)`` returned ``True``;
     * redirected -- everything else.

     For stable sites the ``base_url`` column of ``Site`` is overwritten
     with the inferred value and the session is committed.  All three
     groups are written to the log; nothing is returned.

     Parameters
     ----------
     session : object
         Database session exposing ``execute``, ``query`` and ``commit``.
     include_disabled : bool
         Only the literal ``True`` lifts the ``is_enabled IS TRUE`` filter.
     """
     if include_disabled is True:
         query = 'SELECT id, domain FROM site ORDER BY domain'
     else:
         query = ('SELECT id, domain FROM site WHERE is_enabled IS TRUE'
                  ' ORDER BY domain')

     stable, redirected, inactive = [], [], []
     for site_id, domain in session.execute(query).fetchall():
         base_url = infer_base_url(domain)
         if base_url is None:
             inactive.append({'id': site_id, 'domain': domain})
             continue
         record = {'id': site_id, 'domain': domain, 'base_url': base_url}
         # Identity check kept on purpose: only an exact ``True`` counts
         # as stable, any other return value is treated as redirected.
         if owns_url(domain, base_url) is True:
             stable.append(record)
         else:
             redirected.append(record)

     logger.info('Stable sites: %s', pprint.pformat(stable))
     # Only stable sites get their stored base_url refreshed.
     for record in stable:
         matching = session.query(Site).filter_by(id=record['id'])
         matching.update({'base_url': record['base_url']},
                         synchronize_session=False)
     session.commit()
     logger.info('Inactive sites: %s', pprint.pformat(inactive))
     logger.info('Redirected sites: %s', pprint.pformat(redirected))
Beispiel #2
0
def parse_site(site):
    """Fill the optional fields of a site dict.

    When ``base_url`` is absent it is derived from ``site['domain']``:
    a site already flagged ``is_alive=False`` gets the default
    ``http://<domain>/`` without any lookup; otherwise ``infer_base_url``
    is consulted and ``is_alive`` is set according to its result.
    ``fill_rules`` is always applied before returning.

    Parameters
    ----------
    site : dict
        Site description; mutated in place.

    Returns
    -------
    tuple
        (site, status), where status is one of ``'ok'``, ``'invalid'``
        (a key from ``REQ_FIELDS`` is missing), ``'inactive'`` or
        ``'redirected'``.
    """
    status = 'ok' if all(k in site for k in REQ_FIELDS) else 'invalid'
    if 'base_url' not in site:
        if isinstance(site, str):
            # A plain string cannot be processed below (it has no
            # ``get``); report the offending entry through the module
            # logger instead of writing to stdout.
            logger.error('Expected a site dict but got a string: %r', site)
        if site.get('is_alive', True) is False:
            # Explicitly marked dead: skip inference entirely.
            base_url = None
        else:
            base_url = infer_base_url(site['domain'])
            if base_url is None:
                logger.warning('Domain %s is inactive!', site['domain'])
        # NOTE(review): the assignments below may replace an earlier
        # 'invalid' status with 'inactive' or 'redirected'; this mirrors
        # the existing behavior -- confirm it is intended.
        if base_url is None:
            site['is_alive'] = False
            site['base_url'] = 'http://' + site['domain'] + '/'
            status = 'inactive'
        else:
            # Evaluate ownership before mutating the dict so a failure
            # in owns_url leaves ``site`` untouched.
            owned = owns_url(site['domain'], base_url)
            site['base_url'] = base_url
            site['is_alive'] = True
            if not owned:
                status = 'redirected'
    fill_rules(site)
    return (site, status)
Beispiel #3
0
def parse_domain(line, site_type):
    """Build a site dict from a raw line that holds a domain name.

    The line is lower-cased and stripped, checked against ``DOMAIN_RE``,
    relieved of a leading ``www.`` and rejected if three characters or
    fewer remain.  The base URL is then looked up with
    ``infer_base_url`` and the resulting dict is completed by
    ``fill_rules``.

    Parameters
    ----------
    line : string
    site_type : {'claim', 'fact_checking'}

    Returns
    -------
    tuple
        (site, status).  ``site`` is ``None`` when status is
        ``'invalid'``; otherwise status is ``'inactive'`` (no base URL
        inferred), ``'ok'`` (``owns_url`` is truthy) or ``'redirected'``.
    """
    domain = line.lower().strip()
    if DOMAIN_RE.match(domain) is None:
        return (None, 'invalid')
    if domain.startswith('www.'):
        domain = domain[len('www.'):]
    if len(domain) <= 3:
        return (None, 'invalid')

    resolved = infer_base_url(domain)
    site = {'name': domain, 'domain': domain}
    if resolved is None:
        # Nothing could be inferred: fall back to a plain http URL and
        # flag the site as dead.
        site['base_url'] = 'http://' + domain + '/'
        site['site_type'] = site_type
        site['is_alive'] = False
        status = 'inactive'
    else:
        status = 'ok' if owns_url(domain, resolved) else 'redirected'
        site['base_url'] = resolved
        site['site_type'] = site_type
    fill_rules(site)
    return (site, status)