def run(src_directory, dst, worker_processes, limit=None, move=False):
    # Schedule one S3CopyRawReport task per report file found in
    # src_directory, stopping after `limit` files when a limit is given and
    # deleting the source copy when `move` is set.
    sch = luigi.scheduler.CentralPlannerScheduler()
    w = luigi.worker.Worker(scheduler=sch,
                            worker_processes=worker_processes)
    idx = 0
    uploaded_files = []
    for filename in list_report_files(
            src_directory,
            aws_access_key_id=config.aws.access_key_id,
            aws_secret_access_key=config.aws.secret_access_key):
        if limit is not None and idx >= limit:
            break
        idx += 1
        logging.info("uploading %s" % filename)
        task = S3CopyRawReport(src=filename, dst=dst, move=move)
        uploaded_files.append(task.output().path)
        w.add(task)
    w.run()
    w.stop()

    # Collect the distinct date directories (the parent directory name of
    # each uploaded file) so callers can trigger per-date downstream work.
    uploaded_dates = []
    for uploaded_file in uploaded_files:
        uploaded_date = os.path.basename(os.path.dirname(uploaded_file))
        if uploaded_date not in uploaded_dates:
            uploaded_dates.append(uploaded_date)
    return uploaded_dates
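# Usage sketch (illustrative only): how this run() entry point might be
# invoked. The bucket paths below are hypothetical placeholders, not values
# taken from the original script.
uploaded_dates = run(
    src_directory="s3://example-incoming/reports/",   # assumed source
    dst="s3://example-private/reports-raw/",          # assumed destination
    worker_processes=8,
    limit=100,
)
for uploaded_date in uploaded_dates:
    logging.info("reports uploaded for %s" % uploaded_date)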
def run(srcs, dst_private, worker_processes=16):
    # Schedule one MoveReportFiles task per source, copying the report files
    # it lists into the private destination.
    sch = luigi.scheduler.CentralPlannerScheduler()
    w = luigi.worker.Worker(scheduler=sch,
                            worker_processes=worker_processes)
    report_files = []
    for src in srcs:
        logging.info("Adding headers for src: %s" % src)
        # Materialise the listing once so the same list can be handed to the
        # task and returned to the caller.
        report_files = list(list_report_files(
            src,
            key_file=config.core.ssh_private_key_file,
            no_host_key_check=True
        ))
        task = MoveReportFiles(report_files=report_files,
                               dst_private=dst_private)
        w.add(task)
    w.run()
    w.stop()
    # Note: only the listing for the last src is returned.
    return report_files
def run(self):
    # Load the bridge DB used when sanitising bridge reachability reports.
    with get_luigi_target(config.ooni.bridge_db_path).open('r') as f:
        self.bridge_db = json.load(f)

    output = self.output()
    raw_streams = output["raw_streams"].open('w')
    sanitised_streams = output["sanitised_streams"].open('w')

    reports_path = os.path.join(self.src, self.date.strftime("%Y-%m-%d"))
    logger.debug("listing path %s" % reports_path)
    for filename in list_report_files(reports_path,
                                      config.aws.access_key_id,
                                      config.aws.secret_access_key):
        logger.debug("got filename %s" % filename)
        try:
            self.process_report(filename, sanitised_streams, raw_streams)
        except Exception:
            # Log and continue so one broken report does not abort the
            # whole day's processing.
            logger.error("error in processing %s" % filename)
            logger.error(traceback.format_exc())

    raw_streams.close()
    sanitised_streams.close()
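# Sketch of an enclosing Luigi task that would satisfy the contract this
# run() method relies on: output() must return a dict with "raw_streams" and
# "sanitised_streams" targets, and the task must expose src and date. The
# class name, parameters, and output paths here are assumptions made for
# illustration; they are not taken from the original module. The run() above
# would be a method of such a class.
class SanitiseReport(luigi.Task):
    src = luigi.Parameter()        # base path containing YYYY-MM-DD report dirs
    dst = luigi.Parameter()        # base path for the generated streams
    date = luigi.DateParameter()

    def output(self):
        day = self.date.strftime("%Y-%m-%d")
        return {
            # run() opens both of these targets for writing.
            "raw_streams": get_luigi_target(
                os.path.join(self.dst, "raw", "%s.json" % day)),
            "sanitised_streams": get_luigi_target(
                os.path.join(self.dst, "sanitised", "%s.json" % day))
        }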