Example #1
    def run_fetch(self, app, args):
        # Either a provider name or a URI is required.
        if not args.provider and not args.uri:
            raise extensions.CommandUsageError()

        engine = scraper.Engine(app.srvs)
        ctx = engine.build_context(args.provider, args.uri)
        result = engine.fetch_one(ctx)
        args.output.write(result)
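The body above assumes the usual argparse-style command wiring: `args.provider`, `args.uri` and a writable `args.output`. A minimal sketch of how those arguments might be declared; the parser setup is an assumption for illustration, not part of the arroyo source:

    import argparse
    import sys

    # Hypothetical wiring; only the run_fetch body is shown above.
    parser = argparse.ArgumentParser(prog="fetch")
    parser.add_argument("--provider")
    parser.add_argument("--uri")
    parser.add_argument("--output", type=argparse.FileType("w"),
                        default=sys.stdout)

    args = parser.parse_args(["--uri", "https://example.com/feed"])
    # cmd.run_fetch(app, args) would fetch one page and write it out.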
Example #2
    def run_parse(self, app, args):
        engine = scraper.Engine()
        ctx = engine.build_context(provider=args.provider,
                                   type=args.type,
                                   language=args.language)
        buffer = args.input.read()

        # Parse the raw buffer and dump the results as indented JSON.
        results = list(engine.parse_one(ctx, buffer))
        output = json.dumps([x.dict() for x in results], indent=2)

        args.output.write(output)
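Each parsed result exposes a `.dict()` method, so serialization is just a list comprehension plus `json.dumps`. A self-contained sketch of that output step, with a stand-in dataclass in place of the engine's real result type:

    import json
    from dataclasses import asdict, dataclass

    @dataclass
    class Result:
        # Stand-in for the engine's result objects, which expose .dict().
        name: str
        uri: str

        def dict(self):
            return asdict(self)

    results = [Result("foo", "magnet:?xt=a"), Result("bar", "magnet:?xt=b")]
    print(json.dumps([x.dict() for x in results], indent=2))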
Example #3
    def run_scrape(self, app, args):
        # Either a provider name or a URI is required.
        if not args.provider and not args.uri:
            raise extensions.CommandUsageError()

        engine = scraper.Engine(app.srvs)
        # Build one context per iteration and process them as a batch.
        ctxs = engine.build_n_contexts(args.iterations,
                                       args.provider,
                                       args.uri,
                                       type=args.type,
                                       language=args.language)
        results = engine.process(*ctxs)

        output = json.dumps([x.dict() for x in results], indent=2)
        args.output.write(output)
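`build_n_contexts` returns one context per iteration, and `engine.process` takes contexts as positional arguments, hence the star-unpacking. The variadic pattern, illustrated with stand-in functions (the real engine does the network fetching and parsing; provider and parameter values here are illustrative):

    def build_n_contexts(n, provider, uri, **params):
        # Stand-in: one context per iteration (e.g. one per page).
        return [dict(provider=provider, uri=uri, page=i, **params)
                for i in range(n)]

    def process(*ctxs):
        # Stand-in: the real engine fetches and parses each context.
        return ["results for page %d" % ctx["page"] for ctx in ctxs]

    ctxs = build_n_contexts(3, "eztv", None, type="episode", language="en-us")
    print(process(*ctxs))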
Example #4
    def do_query2(self, app, args):
        def _parse_queryparams(pairs):
            for pair in pairs:
                key, value = pair.split('=', 1)
                if not key or not value:
                    raise ValueError(pair)

                yield (key, value)

        if not args.queryparams and not args.querystring:
            errmsg = "queryparams or querystring is required"
            print(errmsg, file=sys.stderr)
            raise extensions.CommandUsageError()

        q = {}
        if args.querystring:
            q = query.Query.fromstring(args.querystring)

        if args.queryparams:
            params = dict(_parse_queryparams(args.queryparams))
            q = query.Query(**params)

        # Set up filters before scraping anything
        query_engine = query.Engine()
        try:
            filters = query_engine.build_filter(q)
        except query.MissingFiltersError as e:
            errmsg = "Unknow filters: %s"
            errmsg = errmsg % ', '.join(e.args[0])
            print(errmsg, file=sys.stderr)
            raise extensions.CommandUsageError()

        # Build scrape ctxs and process them
        scrape_engine = scraper.Engine()
        ctxs = scrape_engine.build_contexts_for_query(q)
        sources = scrape_engine.process(*ctxs)
        sources = analyze.analyze(*sources)

        # Pass sources through filters
        results = query_engine.apply(filters, sources)
        results = query_engine.sort(results)

        # Output
        results = [[entity.dict(), [src.dict() for src in sources]]
                   for (entity, sources) in results]
        output = json.dumps(results, indent=2, default=_json_encode_hook)
        args.output.write(output)
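The final `json.dumps` call relies on a `_json_encode_hook` defined elsewhere in the module. A typical hook of this kind falls back to ISO strings for dates; this is a guess at its purpose, not the project's actual implementation:

    import datetime
    import json

    def _json_encode_hook(obj):
        # Illustrative fallback encoder; the real hook may differ.
        if isinstance(obj, (datetime.date, datetime.datetime)):
            return obj.isoformat()
        raise TypeError("%r is not JSON serializable" % obj)

    print(json.dumps({"when": datetime.date(2020, 1, 1)},
                     default=_json_encode_hook))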
Example #5
    def __init__(
        self,
        settings_path: str,
        database_path: str,
        log_level: int = logging.WARNING,
    ):
        # Setup logging
        handler = logging.StreamHandler()
        handler.setFormatter(LogFormatter(defaults.LOG_FORMAT))

        logger = logging.getLogger(defaults.APP_NAME)
        logger.setLevel(log_level)
        logger.addHandler(handler)

        # Setup filepaths
        touch(database_path)
        touch(settings_path)

        network_cache_path = appdirs.user_cache_dir() + "/arroyo/network"
        os.makedirs(network_cache_path, exist_ok=True)

        # Setup core
        self.srvs = Services(
            logger=logger,
            db=database.Database("sqlite:///" + database_path),
            settings=Settings(
                ConfigFileStorage(settings_path, root=defaults.APP_NAME)),
            loader=loader.ClassLoader(defaults.PLUGINS),
        )

        if self.srvs.settings.get("cache.enabled"):
            self.srvs.cache = cache.DiskCache(
                basedir=network_cache_path,
                delta=self.srvs.settings.get("cache.delta"),
            )

        # Setup engines
        self.scraper = scraper.Engine(self.srvs)
        self.filters = query.Engine(self.srvs)
        self.downloads = downloads.Downloads(self.srvs)
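The logging block at the top of the constructor is the standard handler/formatter pattern from the stdlib; here it is in isolation, with a plain `logging.Formatter` and a literal logger name standing in for the project's `LogFormatter`, `defaults.LOG_FORMAT` and `defaults.APP_NAME`:

    import logging

    handler = logging.StreamHandler()
    # Plain Formatter standing in for LogFormatter(defaults.LOG_FORMAT).
    handler.setFormatter(
        logging.Formatter("%(asctime)s [%(levelname)s] %(name)s: %(message)s"))

    logger = logging.getLogger("arroyo")
    logger.setLevel(logging.WARNING)
    logger.addHandler(handler)
    logger.warning("cache disabled")  # emitted through the handler above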