# Imports needed by this excerpt. The stdlib and appdirs imports are certain;
# the project-local modules used below (extensions, scraper, query, analyze,
# cache, database, downloads, loader, defaults and friends) are assumed to be
# imported elsewhere in this package.
import json
import logging
import os
import sys

import appdirs


def run_fetch(self, app, args):
    # A provider or a URI is required to build the fetch context
    if not args.provider and not args.uri:
        raise extensions.CommandUsageError()

    engine = scraper.Engine(app.srvs)
    ctx = engine.build_context(args.provider, args.uri)
    result = engine.fetch_one(ctx)
    args.output.write(result)
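# Hedged usage sketch (not part of the original module): driving run_fetch
# with a stand-in args object. SimpleNamespace and StringIO are stdlib;
# `cmd` and `app` are assumed to be the command instance and application
# object built by the surrounding CLI layer, and "eztv" is a hypothetical
# provider name.
#
#   from io import StringIO
#   from types import SimpleNamespace
#
#   args = SimpleNamespace(provider="eztv", uri=None, output=StringIO())
#   cmd.run_fetch(app, args)
#   raw = args.output.getvalue()  # raw provider payload, reusable as run_parse input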
def run_parse(self, app, args):
    # Parsing works on an already fetched buffer, so the engine is
    # built without application services
    engine = scraper.Engine()
    ctx = engine.build_context(provider=args.provider,
                               type=args.type,
                               language=args.language)
    buffer = args.input.read()
    results = list(engine.parse_one(ctx, buffer))

    output = json.dumps([x.dict() for x in results], indent=2)
    args.output.write(output)
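# Hedged usage sketch (assumption, mirroring the attributes accessed above):
# feeding a previously fetched payload into run_parse. The provider name and
# the file name are hypothetical.
#
#   from io import StringIO
#   from types import SimpleNamespace
#
#   args = SimpleNamespace(
#       provider="eztv",               # hypothetical provider name
#       type=None,
#       language=None,
#       input=open("page.html"),       # payload captured earlier by run_fetch
#       output=StringIO(),
#   )
#   cmd.run_parse(app, args)
#   print(args.output.getvalue())      # JSON list of parsed sources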
def run_scrape(self, app, args):
    # A provider or a URI is required to build the scrape contexts
    if not args.provider and not args.uri:
        raise extensions.CommandUsageError()

    engine = scraper.Engine(app.srvs)
    ctxs = engine.build_n_contexts(args.iterations,
                                   args.provider,
                                   args.uri,
                                   type=args.type,
                                   language=args.language)
    results = engine.process(*ctxs)

    output = json.dumps([x.dict() for x in results], indent=2)
    args.output.write(output)
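# Hedged usage sketch (assumption): run_scrape combines fetch and parse over
# several contexts. The attribute names mirror the accesses above; the values
# are hypothetical.
#
#   from io import StringIO
#   from types import SimpleNamespace
#
#   args = SimpleNamespace(
#       provider="eztv",      # hypothetical provider name
#       uri=None,
#       iterations=3,         # scrape three consecutive contexts (e.g. pages)
#       type="episode",
#       language=None,
#       output=StringIO(),
#   )
#   cmd.run_scrape(app, args)
#   print(args.output.getvalue())  # JSON list of scraped sources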
def do_query2(self, app, args):
    def _parse_queryparams(pairs):
        for pair in pairs:
            key, value = pair.split('=', 1)
            if not key or not value:
                raise ValueError(pair)

            yield (key, value)

    if not args.queryparams and not args.querystring:
        errmsg = "A filter or a querystring is required"
        print(errmsg, file=sys.stderr)
        raise extensions.CommandUsageError()

    # Build the query from either the querystring or key=value pairs
    q = {}
    if args.querystring:
        q = query.Query.fromstring(args.querystring)
    if args.queryparams:
        params = dict(_parse_queryparams(args.queryparams))
        q = query.Query(**params)

    # Set up filters before scraping anything
    query_engine = query.Engine()
    try:
        filters = query_engine.build_filter(q)
    except query.MissingFiltersError as e:
        errmsg = "Unknown filters: %s"
        errmsg = errmsg % ', '.join(e.args[0])
        print(errmsg, file=sys.stderr)
        raise extensions.CommandUsageError()

    # Build scrape contexts and process them
    scrape_engine = scraper.Engine()
    ctxs = scrape_engine.build_contexts_for_query(q)
    sources = scrape_engine.process(*ctxs)
    sources = analyze.analyze(*sources)

    # Pass sources through the filters
    results = query_engine.apply(filters, sources)
    results = query_engine.sort(results)

    # Output entity/sources pairs as JSON
    results = [
        [entity.dict(), [src.dict() for src in sources]]
        for (entity, sources) in results
    ]
    output = json.dumps(results, indent=2, default=_json_encode_hook)
    args.output.write(output)
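# Hedged usage sketch (assumption): invoking do_query2 with key=value filter
# pairs. The attribute names mirror the accesses above, but the concrete
# filter names ("name", "type") are hypothetical and depend on which filters
# are registered with query.Engine.
#
#   from io import StringIO
#   from types import SimpleNamespace
#
#   args = SimpleNamespace(
#       querystring=None,
#       queryparams=["name=ubuntu", "type=other"],
#       output=StringIO(),
#   )
#   cmd.do_query2(app, args)
#   # args.output now holds a JSON list of [entity, [sources...]] pairs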
def __init__(
    self,
    settings_path: str,
    database_path: str,
    log_level: int = logging.WARNING,
):
    # Set up logging
    handler = logging.StreamHandler()
    handler.setFormatter(LogFormatter(defaults.LOG_FORMAT))
    logger = logging.getLogger(defaults.APP_NAME)
    logger.setLevel(log_level)
    logger.addHandler(handler)

    # Set up filepaths
    touch(database_path)
    touch(settings_path)
    network_cache_path = appdirs.user_cache_dir() + "/arroyo/network"
    os.makedirs(network_cache_path, exist_ok=True)

    # Set up core services
    self.srvs = Services(
        logger=logger,
        db=database.Database("sqlite:///" + database_path),
        settings=Settings(
            ConfigFileStorage(settings_path, root=defaults.APP_NAME)),
        loader=loader.ClassLoader(defaults.PLUGINS),
    )

    # Optional on-disk cache for network requests
    if self.srvs.settings.get("cache.enabled"):
        self.srvs.cache = cache.DiskCache(
            basedir=network_cache_path,
            delta=self.srvs.settings.get("cache.delta"),
        )

    # Set up engines
    self.scraper = scraper.Engine(self.srvs)
    self.filters = query.Engine(self.srvs)
    self.downloads = downloads.Downloads(self.srvs)
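# Hedged construction sketch (assumption): the enclosing class name
# "Application" and the concrete paths are hypothetical; the keyword
# arguments mirror the signature above.
#
#   import logging
#
#   app = Application(  # hypothetical class name
#       settings_path="/home/user/.config/arroyo/settings.ini",
#       database_path="/home/user/.local/share/arroyo/arroyo.db",
#       log_level=logging.DEBUG,
#   )
#   # app.scraper, app.filters and app.downloads are then ready to use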