def run(date_interval, src="s3n://ooni-public/reports-sanitised/streams/", dst="s3n://ooni-public/processed/", imported_dir="s3n://ooni-public/reports-sanitised/yaml/", worker_processes=16):
    """Schedule a SparkResultsToDatabase task for every date in the interval.

    Only dates whose raw YAML reports have already been imported (as listed
    under ``imported_dir``) are scheduled; other dates are skipped.

    :param date_interval: luigi-style date interval string to iterate over.
    :param src: S3 prefix holding the sanitised report streams.
    :param dst: S3 prefix where processed output is written.
    :param imported_dir: S3 prefix used to discover which dates were imported.
    :param worker_processes: number of worker processes for the luigi worker.
    """
    scheduler = luigi.scheduler.CentralPlannerScheduler()
    worker = luigi.worker.Worker(
        scheduler=scheduler,
        worker_processes=worker_processes)
    # Dates with no imported YAML have nothing to process, so fetch the
    # imported set up front and filter against it.
    imported_dates = get_imported_dates(
        imported_dir,
        aws_access_key_id=config.aws.access_key_id,
        aws_secret_access_key=config.aws.secret_access_key)
    for date in get_date_interval(date_interval):
        if str(date) not in imported_dates:
            continue
        logger.info("Running CountInterestingReports for %s on %s to %s" % (date, src, dst))
        worker.add(SparkResultsToDatabase(src=src, date=date, dst=dst))
    worker.run()
    worker.stop()
def run(src, dst_private, dst_public, date_interval, worker_processes=16):
    """Schedule an AggregateYAMLReports task for every date in the interval.

    :param src: source location of the YAML reports.
    :param dst_private: destination for the private aggregated output.
    :param dst_public: destination for the public aggregated output.
    :param date_interval: luigi-style date interval string to iterate over.
    :param worker_processes: number of worker processes for the luigi worker.
    """
    scheduler = luigi.scheduler.CentralPlannerScheduler()
    worker = luigi.worker.Worker(
        scheduler=scheduler,
        worker_processes=worker_processes)
    for date in get_date_interval(date_interval):
        logger.debug("working on %s" % date)
        aggregate_task = AggregateYAMLReports(
            dst_private=dst_private,
            dst_public=dst_public,
            src=src,
            date=date)
        worker.add(aggregate_task)
    worker.run()
    worker.stop()
def run(src, dst_private, dst_public, date_interval, worker_processes=16):
    """Schedule a ReportHeadersToDatabase task for every imported date.

    Only dates whose reports have already been imported (as discovered under
    ``src``) are scheduled; other dates in the interval are skipped.

    :param src: source location of the imported reports.
    :param dst_private: destination for the private output.
    :param dst_public: destination for the public output.
    :param date_interval: luigi-style date interval string to iterate over.
    :param worker_processes: number of worker processes for the luigi worker.
    """
    sch = luigi.scheduler.CentralPlannerScheduler()
    w = luigi.worker.Worker(scheduler=sch, worker_processes=worker_processes)
    imported_dates = get_imported_dates(
        src,
        aws_access_key_id=config.aws.access_key_id,
        aws_secret_access_key=config.aws.secret_access_key)
    interval = get_date_interval(date_interval)
    for date in interval:
        if str(date) not in imported_dates:
            continue
        # Fix: use the module-level `logger` (as the sibling pipelines do)
        # instead of the root `logging` module, and pass %-args lazily so
        # formatting only happens when the record is emitted.
        logger.info("adding headers for date: %s", date)
        task = ReportHeadersToDatabase(
            dst_private=dst_private,
            dst_public=dst_public,
            src=src,
            date=date)
        w.add(task)
    w.run()
    w.stop()