def load_domains(cls,
                 session,
                 fn,
                 site_type,
                 ignore_inactive=False,
                 force_inactive=False,
                 ignore_redirected=False,
                 exclusive=False):
    """Load domains from file `fn` and insert or update them as sites.

    Each non-comment line of the file is parsed with `parse_domain`,
    which probes the domain over HTTP to infer its base URL and status.

    Parameters
    ----------
    session : object
        A SQLAlchemy session instance.
    fn : str
        Path of a text file listing one domain per line.
    site_type : str
        Site type assigned to every loaded domain.
    ignore_inactive : bool
        Skip inactive domains instead of aborting.
    force_inactive : bool
        Insert inactive domains anyway instead of aborting.
    ignore_redirected : bool
        Skip redirected domains instead of aborting.
    exclusive : bool
        If True, disable existing sites of the same `site_type` first.

    Raises
    ------
    SystemExit
        When the file contains invalid domains, or inactive/redirected
        domains not permitted by the flags above.
    """
    if exclusive:
        # Disable existing domains of the same site type.
        ob_expr = Site.id.asc()
        msites = get_msites(session, fb_kw=None, ob_expr=ob_expr)
        for site in msites:
            if site.site_type == site_type:
                cls.disable_site(session, site)
    logger.info('Sending HTTP requests to infer base URLs ...')
    with open(fn, 'r') as f:
        # Tuples are (line_number, raw_line, site_dict, status).
        site_tuples = [(n, line) + parse_domain(line, site_type)
                       for n, line in enumerate(f, 1)
                       if not is_comment_line(line)]
    # First pass: report every problem, then decide whether to abort.
    invalid_flag = False
    inactive_flag = False
    redirected_flag = False
    for n, line, site, status in site_tuples:
        line = line.strip('\n')
        if status == 'invalid':
            invalid_flag = True
            logger.error('line %i %r, invalid domain', n, line)
        elif status == 'inactive':
            inactive_flag = True
            logger.warning('line %i %r, domain inactive!', n, line)
        elif status == 'redirected':
            redirected_flag = True
            logger.warning('line %i %r, domain redirected to %s!', n, line,
                           site['base_url'])
    if invalid_flag or \
            (inactive_flag and not (ignore_inactive or force_inactive)) or \
            (redirected_flag and not ignore_redirected):
        # NOTE: message rewritten — the original had a stray quote and
        # inconsistent casing ("or Use").
        logger.error(
            'Please fix the warnings or errors above! Edit domains, '
            'or use --ignore-redirected to handle redirected domains, '
            'or use --ignore-inactive or --force-inactive to handle '
            'inactive domains')
        raise SystemExit(2)
    # Second pass: insert or update the domains that passed the checks.
    for n, line, site, status in site_tuples:
        if status == 'inactive' and ignore_inactive:
            continue
        if status == 'redirected' and ignore_redirected:
            continue
        site['is_enabled'] = True
        get_or_create_m(
            session, Site, site, fb_uk='domain', onduplicate='update')
        logger.debug('Insert or update site %s', site['domain'])
def dump(cls, session, yaml_fn):
    """Dump all sites in the database into a YAML file.

    Parameters
    ----------
    session : object
        A SQLAlchemy session instance.
    yaml_fn : str
        Path of the output YAML file.
    """

    def as_rule_maps(rules):
        # Copy spider rules into CommentedMaps so round-trip dumping
        # keeps a stable key order; shared by 'update' and 'archive'.
        rendered = []
        for rule in rules:
            m = CommentedMap()
            m['spider_name'] = rule['spider_name']
            m['spider_kwargs'] = rule['spider_kwargs']
            rendered.append(m)
        return rendered

    ob_expr = Site.id.asc()
    msites = get_msites(session, fb_kw=None, ob_expr=ob_expr)
    r = []
    for ms in msites:
        site = CommentedMap()
        site['name'] = ms.name
        site['domain'] = ms.domain
        site['site_type'] = ms.site_type
        site['base_url'] = ms.base_url
        site['site_tags'] = [
            dict(name=t.name, source=t.source) for t in ms.site_tags
        ]
        site['alternate_domains'] = [
            dict(name=ad.name, is_alive=ad.is_alive)
            for ad in ms.alternate_domains
        ]
        site['is_alive'] = ms.is_alive
        site['is_enabled'] = ms.is_enabled
        article_rules = CommentedMap()
        site['article_rules'] = article_rules
        article_rules['url_regex'] = ms.article_rules['url_regex']
        article_rules['update'] = as_rule_maps(ms.article_rules['update'])
        article_rules['archive'] = as_rule_maps(ms.article_rules['archive'])
        r.append(site)
    ys = ruamel.yaml.round_trip_dump(r)
    # NOTE: fixed grammar of the generated header ("is generate by",
    # "should locate under").
    head_comments = """\
# This file is generated by the hoaxy site --dump command.
# To understand the sites data structure, please read sites.readme.md, which
# should be located under hoaxy/data/manuals/.
"""
    with open(yaml_fn, 'w') as f:
        f.write(head_comments + ys)
    logger.info('Sites dumped into YAML file %s', yaml_fn)
def run(cls, args):
    """Overriding method as the entry point of this command.

    Validate the docopt `args` against `cls.args_schema`, then dispatch
    to one of the crawling sub-commands: --fetch-url, --fetch-html or
    --parse-article.

    Raises
    ------
    SystemExit
        On schema validation failure, on an empty site table, or when
        the query matches no rows.
    """
    try:
        args = cls.args_schema.validate(args)
    except SchemaError as e:
        raise SystemExit(e)
    # expire_on_commit=False keeps detached objects usable after commit.
    session = Session(expire_on_commit=False)
    where_expr = args['--where-expr']
    ob_expr = args.get('--order-by', 'asc')
    limit = args['--limit']
    # --fetch-url
    if args['--fetch-url'] is True:
        configure_logging(
            'crawl.fetch-url', console_level='DEBUG', file_level='WARNING')
        purpose = 'update' if args['--update'] is True else 'archive'
        if where_expr is None:
            where_expr = [text(DEFAULT_WHERE_EXPR_FETCH_URL)]
        else:
            where_expr = [text(where_expr)]
        ob_expr = Site.id.asc() if ob_expr == 'asc' else Site.id.desc()
        msites = get_msites(
            session, f_expr=where_expr, ob_expr=ob_expr, limit=limit)
        if len(msites) == 0:
            logger.warning("None sites you queried found in DB!")
            raise SystemExit(2)
        platform_id = get_platform_id(session, name=N_PLATFORM_WEB)
        # Detach msites from the session, since they definitely would
        # not be modified in session.
        for ms in msites:
            session.expunge(ms)
        logger.warning('Starting crawling process to fetch URL update ...')
        cls.fetch_url(session, msites, platform_id, purpose)
    # --fetch-html
    elif args['--fetch-html'] is True:
        configure_logging(
            'crawl.fetch-html', console_level='DEBUG', file_level='WARNING')
        if not session.query(Site.id).count() > 0:
            raise SystemExit('Your site table is empty!')
        q = session.query(Url.id, Url.raw)
        if where_expr is None:
            where_expr = [text(DEFAULT_WHERE_EXPR_FETCH_HTML)]
        else:
            where_expr = [text(where_expr)]
        ob_expr = Url.id.asc() if ob_expr == 'asc' else Url.id.desc()
        q = q.filter(*where_expr).order_by(ob_expr)
        if limit is not None:
            q = q.limit(limit)
        logger.info(
            q.statement.compile(compile_kwargs={"literal_binds": True}))
        url_tuples = q.all()
        if not url_tuples:
            logger.warning('No such URLs in DB!')
            raise SystemExit(2)
        # NOTE: fixed typo 'Staring' -> 'Starting' in the log message.
        logger.warning('Starting crawling process to fetch HTML ...')
        cls.fetch_html(session, url_tuples)
    # --parse-article
    elif args['--parse-article'] is True:
        configure_logging(
            'crawl.parse-article',
            console_level='DEBUG',
            file_level='WARNING')
        q = session.query(Url.id, Url.created_at, Url.date_published,
                          Url.canonical, Url.site_id)
        if where_expr is None:
            where_expr = [text(DEFAULT_WHERE_EXPR_PARSE_ARTICLE)]
        else:
            where_expr = [text(where_expr)]
        ob_expr = Url.id.asc() if ob_expr == 'asc' else Url.id.desc()
        q = q.filter(*where_expr).order_by(ob_expr)
        if limit is not None:
            q = q.limit(limit)
        logger.info(
            q.statement.compile(compile_kwargs={"literal_binds": True}))
        url_tuples = q.all()
        if not url_tuples:
            logger.warning('No URLs found from DB!')
            raise SystemExit(2)
        logger.warning('Starting crawling process to parse article ...')
        cls.parse_article(session, url_tuples)
    session.close()
def run(cls, args):
    """Overriding method as the entry point of this command.

    Dispatch the parsed docopt `args` to the matching site sub-command
    (load, add, tag, enable/disable, status or dump).
    """
    # expire_on_commit=False keeps detached objects usable after commit.
    session = Session(expire_on_commit=False)

    def site_identity():
        # The site the user referred to, preferring --name over --domain;
        # used only for log messages.
        if args['--name'] is not None:
            return args['--name']
        return args['--domain']

    def query_site():
        # Look up the site by --name and/or --domain.
        return qquery_msite(
            session, name=args['--name'], domain=args['--domain'])

    # Expand user home for the file argument, if given.
    if args['<file>'] is not None:
        args['<file>'] = os.path.expanduser(args['<file>'])
    # --load-domains command
    if args['--load-domains'] is True:
        configure_logging(
            'site.load-domains',
            console_level=args['--console-log-level'],
            file_level='WARNING')
        # NOTE: docopt always supplies the '<file>' key (value None when
        # not given), so dict.get's default never applied; use `or` so
        # the intended fallback path actually takes effect.
        fn = args.get('<file>') or join(HOAXY_HOME, 'domains.txt')
        logger.info('Loading data from file %r', fn)
        cls.load_domains(
            session,
            fn,
            site_type=args['--site-type'],
            ignore_inactive=args['--ignore-inactive'],
            force_inactive=args['--force-inactive'],
            ignore_redirected=args['--ignore-redirected'],
            exclusive=args['--exclusive'])
    # --load-sites command
    elif args['--load-sites'] is True:
        configure_logging(
            'site.load-sites',
            console_level=args['--console-log-level'],
            file_level='WARNING')
        fn = args.get('<file>') or join(HOAXY_HOME, 'sites.yaml')
        logger.info('Loading data from file %r', fn)
        cls.load_sites(
            session,
            fn,
            ignore_inactive=args['--ignore-inactive'],
            force_inactive=args['--force-inactive'],
            ignore_redirected=args['--ignore-redirected'])
    # --add command
    elif args['--add'] is True:
        configure_logging(
            'site.add',
            console_level=args['--console-log-level'],
            file_level='WARNING')
        msite = qquery_msite(session, domain=args['--domain'])
        if msite is not None:
            logger.warning('Site %s already exists!', args['--domain'])
        else:
            cls.add_site(
                session,
                domain=args['--domain'],
                site_type=args['--site-type'],
                name=args['--name'],
                tag_source=args['--tag-source'],
                site_tags=args['--site-tag'],
                alternate_domains=args['--alternate-domain'],
                ignore_inactive=args['--ignore-inactive'],
                force_inactive=args['--force-inactive'],
                ignore_redirected=args['--ignore-redirected'])
    # --add-site-tags
    elif args['--add-site-tags'] is True:
        configure_logging(
            'site.add-site-tags',
            console_level=args['--console-log-level'],
            file_level='WARNING')
        msite = query_site()
        if msite is None:
            logger.warning('Site %s does not exist!', site_identity())
        else:
            cls.add_site_tags(session, msite, args['--tag-source'],
                              args['--site-tag'])
    # --replace-site-tags
    elif args['--replace-site-tags'] is True:
        # NOTE: fixed logger-name typo ('site.repalce-site-tags').
        configure_logging(
            'site.replace-site-tags',
            console_level=args['--console-log-level'],
            file_level='WARNING')
        msite = query_site()
        if msite is None:
            logger.warning('Site %s does not exist!', site_identity())
        else:
            cls.replace_site_tags(session, msite, args['--tag-source'],
                                  args['--site-tag'])
    # --add-alternate-domains
    elif args['--add-alternate-domains'] is True:
        configure_logging(
            'site.add-alternate-domains',
            console_level=args['--console-log-level'],
            file_level='WARNING')
        msite = query_site()
        if msite is None:
            logger.warning('Site %s does not exist!', site_identity())
        else:
            cls.add_alternate_domains(session, msite,
                                      args['--alternate-domain'])
    # --replace-alternate-domains
    elif args['--replace-alternate-domains'] is True:
        configure_logging(
            'site.replace-alternate-domains',
            console_level=args['--console-log-level'],
            file_level='WARNING')
        msite = query_site()
        if msite is None:
            logger.warning('Site %s does not exist!', site_identity())
        else:
            cls.replace_alternate_domains(session, msite,
                                          args['--alternate-domain'])
    # --disable
    elif args['--disable'] is True:
        configure_logging(
            'site.disable',
            console_level=args['--console-log-level'],
            file_level='WARNING')
        msite = query_site()
        if msite is None:
            logger.warning('Site %s does not exist!', site_identity())
        else:
            cls.disable_site(session, msite)
    # --enable
    elif args['--enable'] is True:
        configure_logging(
            'site.enable',
            console_level=args['--console-log-level'],
            file_level='WARNING')
        msite = query_site()
        if msite is None:
            logger.warning('Site %s does not exist!', site_identity())
        else:
            cls.enable_site(session, msite)
    # bulk enable sites and domains
    elif args['--bulk-enable'] is True:
        configure_logging(
            'site.bulk-enable',
            console_level=args['--console-log-level'],
            file_level='WARNING')
        if args['--exclusive'] is True:
            # Disable every existing site before enabling the given ones.
            ob_expr = Site.id.asc()
            msites = get_msites(session, fb_kw=None, ob_expr=ob_expr)
            for existing_site in msites:
                cls.disable_site(session, existing_site)
        if args['--names'] is not None:
            for site in args['--names']:
                msite = qquery_msite(session, name=site, domain=None)
                if msite is None:
                    logger.warning('Site %s does not exist!', site)
                else:
                    cls.enable_site(session, msite)
        else:
            for domain in args['--domains']:
                msite = qquery_msite(session, name=None, domain=domain)
                if msite is None:
                    logger.warning('Site %s does not exist!', domain)
                else:
                    cls.enable_site(session, msite)
    # bulk disable sites and domains
    elif args['--bulk-disable'] is True:
        configure_logging(
            'site.bulk-disable',
            console_level=args['--console-log-level'],
            file_level='WARNING')
        if args['--names'] is not None:
            for site in args['--names']:
                msite = qquery_msite(session, name=site, domain=None)
                if msite is None:
                    logger.warning('Site %s does not exist!', site)
                else:
                    cls.disable_site(session, msite)
        else:
            for domain in args['--domains']:
                msite = qquery_msite(session, name=None, domain=domain)
                if msite is None:
                    logger.warning('Site %s does not exist!', domain)
                else:
                    cls.disable_site(session, msite)
    # --status
    elif args['--status'] is True:
        configure_logging(
            'site.status',
            console_level=args['--console-log-level'],
            file_level='WARNING')
        cls.site_status(session, args['--include-disabled'] is True)
    # --dump
    elif args['--dump'] is True:
        # NOTE: logger name fixed from 'site.status' to 'site.dump'.
        configure_logging(
            'site.dump',
            console_level=args['--console-log-level'],
            file_level='INFO')
        cls.dump(session, args['<file>'])
    session.close()